summaryrefslogtreecommitdiffstats
path: root/src/rocksdb/env
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-21 11:54:28 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-21 11:54:28 +0000
commite6918187568dbd01842d8d1d2c808ce16a894239 (patch)
tree64f88b554b444a49f656b6c656111a145cbbaa28 /src/rocksdb/env
parentInitial commit. (diff)
downloadceph-b26c4052f3542036551aa9dec9caa4226e456195.tar.xz
ceph-b26c4052f3542036551aa9dec9caa4226e456195.zip
Adding upstream version 18.2.2.upstream/18.2.2
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/rocksdb/env')
-rw-r--r--src/rocksdb/env/composite_env.cc544
-rw-r--r--src/rocksdb/env/composite_env_wrapper.h380
-rw-r--r--src/rocksdb/env/emulated_clock.h114
-rw-r--r--src/rocksdb/env/env.cc1264
-rw-r--r--src/rocksdb/env/env_basic_test.cc401
-rw-r--r--src/rocksdb/env/env_chroot.cc148
-rw-r--r--src/rocksdb/env/env_chroot.h55
-rw-r--r--src/rocksdb/env/env_encryption.cc1351
-rw-r--r--src/rocksdb/env/env_encryption_ctr.h116
-rw-r--r--src/rocksdb/env/env_posix.cc520
-rw-r--r--src/rocksdb/env/env_test.cc3562
-rw-r--r--src/rocksdb/env/file_system.cc290
-rw-r--r--src/rocksdb/env/file_system_tracer.cc564
-rw-r--r--src/rocksdb/env/file_system_tracer.h461
-rw-r--r--src/rocksdb/env/fs_posix.cc1294
-rw-r--r--src/rocksdb/env/fs_readonly.h107
-rw-r--r--src/rocksdb/env/fs_remap.cc343
-rw-r--r--src/rocksdb/env/fs_remap.h139
-rw-r--r--src/rocksdb/env/io_posix.cc1733
-rw-r--r--src/rocksdb/env/io_posix.h523
-rw-r--r--src/rocksdb/env/io_posix_test.cc141
-rw-r--r--src/rocksdb/env/mock_env.cc1070
-rw-r--r--src/rocksdb/env/mock_env.h144
-rw-r--r--src/rocksdb/env/mock_env_test.cc84
-rw-r--r--src/rocksdb/env/unique_id_gen.cc164
-rw-r--r--src/rocksdb/env/unique_id_gen.h71
26 files changed, 15583 insertions, 0 deletions
diff --git a/src/rocksdb/env/composite_env.cc b/src/rocksdb/env/composite_env.cc
new file mode 100644
index 000000000..b93aa9fcb
--- /dev/null
+++ b/src/rocksdb/env/composite_env.cc
@@ -0,0 +1,544 @@
+// Copyright (c) 2019-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#include "env/composite_env_wrapper.h"
+#include "rocksdb/utilities/options_type.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+// The CompositeEnvWrapper class provides an interface that is compatible
+// with the old monolithic Env API, and an implementation that wraps around
+// the new Env that provides threading and other OS related functionality, and
+// the new FileSystem API that provides storage functionality. By
+// providing the old Env interface, it allows the rest of RocksDB code to
+// be agnostic of whether the underlying Env implementation is a monolithic
+// Env or an Env + FileSystem. In the former case, the user will specify
+// Options::env only, whereas in the latter case, the user will specify
+// Options::env and Options::file_system.
+
+class CompositeSequentialFileWrapper : public SequentialFile {
+ public:
+ explicit CompositeSequentialFileWrapper(
+ std::unique_ptr<FSSequentialFile>& target)
+ : target_(std::move(target)) {}
+
+ Status Read(size_t n, Slice* result, char* scratch) override {
+ IOOptions io_opts;
+ IODebugContext dbg;
+ return target_->Read(n, io_opts, result, scratch, &dbg);
+ }
+ Status Skip(uint64_t n) override { return target_->Skip(n); }
+ bool use_direct_io() const override { return target_->use_direct_io(); }
+ size_t GetRequiredBufferAlignment() const override {
+ return target_->GetRequiredBufferAlignment();
+ }
+ Status InvalidateCache(size_t offset, size_t length) override {
+ return target_->InvalidateCache(offset, length);
+ }
+ Status PositionedRead(uint64_t offset, size_t n, Slice* result,
+ char* scratch) override {
+ IOOptions io_opts;
+ IODebugContext dbg;
+ return target_->PositionedRead(offset, n, io_opts, result, scratch, &dbg);
+ }
+
+ private:
+ std::unique_ptr<FSSequentialFile> target_;
+};
+
+class CompositeRandomAccessFileWrapper : public RandomAccessFile {
+ public:
+ explicit CompositeRandomAccessFileWrapper(
+ std::unique_ptr<FSRandomAccessFile>& target)
+ : target_(std::move(target)) {}
+
+ Status Read(uint64_t offset, size_t n, Slice* result,
+ char* scratch) const override {
+ IOOptions io_opts;
+ IODebugContext dbg;
+ return target_->Read(offset, n, io_opts, result, scratch, &dbg);
+ }
+ Status MultiRead(ReadRequest* reqs, size_t num_reqs) override {
+ IOOptions io_opts;
+ IODebugContext dbg;
+ std::vector<FSReadRequest> fs_reqs;
+ Status status;
+
+ fs_reqs.resize(num_reqs);
+ for (size_t i = 0; i < num_reqs; ++i) {
+ fs_reqs[i].offset = reqs[i].offset;
+ fs_reqs[i].len = reqs[i].len;
+ fs_reqs[i].scratch = reqs[i].scratch;
+ fs_reqs[i].status = IOStatus::OK();
+ }
+ status = target_->MultiRead(fs_reqs.data(), num_reqs, io_opts, &dbg);
+ for (size_t i = 0; i < num_reqs; ++i) {
+ reqs[i].result = fs_reqs[i].result;
+ reqs[i].status = fs_reqs[i].status;
+ }
+ return status;
+ }
+ Status Prefetch(uint64_t offset, size_t n) override {
+ IOOptions io_opts;
+ IODebugContext dbg;
+ return target_->Prefetch(offset, n, io_opts, &dbg);
+ }
+ size_t GetUniqueId(char* id, size_t max_size) const override {
+ return target_->GetUniqueId(id, max_size);
+ }
+ void Hint(AccessPattern pattern) override {
+ target_->Hint((FSRandomAccessFile::AccessPattern)pattern);
+ }
+ bool use_direct_io() const override { return target_->use_direct_io(); }
+ size_t GetRequiredBufferAlignment() const override {
+ return target_->GetRequiredBufferAlignment();
+ }
+ Status InvalidateCache(size_t offset, size_t length) override {
+ return target_->InvalidateCache(offset, length);
+ }
+
+ private:
+ std::unique_ptr<FSRandomAccessFile> target_;
+};
+
+class CompositeWritableFileWrapper : public WritableFile {
+ public:
+ explicit CompositeWritableFileWrapper(std::unique_ptr<FSWritableFile>& t)
+ : target_(std::move(t)) {}
+
+ Status Append(const Slice& data) override {
+ IOOptions io_opts;
+ IODebugContext dbg;
+ return target_->Append(data, io_opts, &dbg);
+ }
+ Status Append(const Slice& data,
+ const DataVerificationInfo& verification_info) override {
+ IOOptions io_opts;
+ IODebugContext dbg;
+ return target_->Append(data, io_opts, verification_info, &dbg);
+ }
+ Status PositionedAppend(const Slice& data, uint64_t offset) override {
+ IOOptions io_opts;
+ IODebugContext dbg;
+ return target_->PositionedAppend(data, offset, io_opts, &dbg);
+ }
+ Status PositionedAppend(
+ const Slice& data, uint64_t offset,
+ const DataVerificationInfo& verification_info) override {
+ IOOptions io_opts;
+ IODebugContext dbg;
+ return target_->PositionedAppend(data, offset, io_opts, verification_info,
+ &dbg);
+ }
+ Status Truncate(uint64_t size) override {
+ IOOptions io_opts;
+ IODebugContext dbg;
+ return target_->Truncate(size, io_opts, &dbg);
+ }
+ Status Close() override {
+ IOOptions io_opts;
+ IODebugContext dbg;
+ return target_->Close(io_opts, &dbg);
+ }
+ Status Flush() override {
+ IOOptions io_opts;
+ IODebugContext dbg;
+ return target_->Flush(io_opts, &dbg);
+ }
+ Status Sync() override {
+ IOOptions io_opts;
+ IODebugContext dbg;
+ return target_->Sync(io_opts, &dbg);
+ }
+ Status Fsync() override {
+ IOOptions io_opts;
+ IODebugContext dbg;
+ return target_->Fsync(io_opts, &dbg);
+ }
+ bool IsSyncThreadSafe() const override { return target_->IsSyncThreadSafe(); }
+
+ bool use_direct_io() const override { return target_->use_direct_io(); }
+
+ size_t GetRequiredBufferAlignment() const override {
+ return target_->GetRequiredBufferAlignment();
+ }
+
+ void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override {
+ target_->SetWriteLifeTimeHint(hint);
+ }
+
+ Env::WriteLifeTimeHint GetWriteLifeTimeHint() override {
+ return target_->GetWriteLifeTimeHint();
+ }
+
+ uint64_t GetFileSize() override {
+ IOOptions io_opts;
+ IODebugContext dbg;
+ return target_->GetFileSize(io_opts, &dbg);
+ }
+
+ void SetPreallocationBlockSize(size_t size) override {
+ target_->SetPreallocationBlockSize(size);
+ }
+
+ void GetPreallocationStatus(size_t* block_size,
+ size_t* last_allocated_block) override {
+ target_->GetPreallocationStatus(block_size, last_allocated_block);
+ }
+
+ size_t GetUniqueId(char* id, size_t max_size) const override {
+ return target_->GetUniqueId(id, max_size);
+ }
+
+ Status InvalidateCache(size_t offset, size_t length) override {
+ return target_->InvalidateCache(offset, length);
+ }
+
+ Status RangeSync(uint64_t offset, uint64_t nbytes) override {
+ IOOptions io_opts;
+ IODebugContext dbg;
+ return target_->RangeSync(offset, nbytes, io_opts, &dbg);
+ }
+
+ void PrepareWrite(size_t offset, size_t len) override {
+ IOOptions io_opts;
+ IODebugContext dbg;
+ target_->PrepareWrite(offset, len, io_opts, &dbg);
+ }
+
+ Status Allocate(uint64_t offset, uint64_t len) override {
+ IOOptions io_opts;
+ IODebugContext dbg;
+ return target_->Allocate(offset, len, io_opts, &dbg);
+ }
+
+ std::unique_ptr<FSWritableFile>* target() { return &target_; }
+
+ private:
+ std::unique_ptr<FSWritableFile> target_;
+};
+
+class CompositeRandomRWFileWrapper : public RandomRWFile {
+ public:
+ explicit CompositeRandomRWFileWrapper(std::unique_ptr<FSRandomRWFile>& target)
+ : target_(std::move(target)) {}
+
+ bool use_direct_io() const override { return target_->use_direct_io(); }
+ size_t GetRequiredBufferAlignment() const override {
+ return target_->GetRequiredBufferAlignment();
+ }
+ Status Write(uint64_t offset, const Slice& data) override {
+ IOOptions io_opts;
+ IODebugContext dbg;
+ return target_->Write(offset, data, io_opts, &dbg);
+ }
+ Status Read(uint64_t offset, size_t n, Slice* result,
+ char* scratch) const override {
+ IOOptions io_opts;
+ IODebugContext dbg;
+ return target_->Read(offset, n, io_opts, result, scratch, &dbg);
+ }
+ Status Flush() override {
+ IOOptions io_opts;
+ IODebugContext dbg;
+ return target_->Flush(io_opts, &dbg);
+ }
+ Status Sync() override {
+ IOOptions io_opts;
+ IODebugContext dbg;
+ return target_->Sync(io_opts, &dbg);
+ }
+ Status Fsync() override {
+ IOOptions io_opts;
+ IODebugContext dbg;
+ return target_->Fsync(io_opts, &dbg);
+ }
+ Status Close() override {
+ IOOptions io_opts;
+ IODebugContext dbg;
+ return target_->Close(io_opts, &dbg);
+ }
+
+ private:
+ std::unique_ptr<FSRandomRWFile> target_;
+};
+
+class CompositeDirectoryWrapper : public Directory {
+ public:
+ explicit CompositeDirectoryWrapper(std::unique_ptr<FSDirectory>& target)
+ : target_(std::move(target)) {}
+
+ Status Fsync() override {
+ IOOptions io_opts;
+ IODebugContext dbg;
+ return target_->FsyncWithDirOptions(io_opts, &dbg, DirFsyncOptions());
+ }
+
+ Status Close() override {
+ IOOptions io_opts;
+ IODebugContext dbg;
+ return target_->Close(io_opts, &dbg);
+ }
+
+ size_t GetUniqueId(char* id, size_t max_size) const override {
+ return target_->GetUniqueId(id, max_size);
+ }
+
+ private:
+ std::unique_ptr<FSDirectory> target_;
+};
+} // namespace
+
+Status CompositeEnv::NewSequentialFile(const std::string& f,
+ std::unique_ptr<SequentialFile>* r,
+ const EnvOptions& options) {
+ IODebugContext dbg;
+ std::unique_ptr<FSSequentialFile> file;
+ Status status;
+ status =
+ file_system_->NewSequentialFile(f, FileOptions(options), &file, &dbg);
+ if (status.ok()) {
+ r->reset(new CompositeSequentialFileWrapper(file));
+ }
+ return status;
+}
+
+Status CompositeEnv::NewRandomAccessFile(const std::string& f,
+ std::unique_ptr<RandomAccessFile>* r,
+ const EnvOptions& options) {
+ IODebugContext dbg;
+ std::unique_ptr<FSRandomAccessFile> file;
+ Status status;
+ status =
+ file_system_->NewRandomAccessFile(f, FileOptions(options), &file, &dbg);
+ if (status.ok()) {
+ r->reset(new CompositeRandomAccessFileWrapper(file));
+ }
+ return status;
+}
+
+Status CompositeEnv::NewWritableFile(const std::string& f,
+ std::unique_ptr<WritableFile>* r,
+ const EnvOptions& options) {
+ IODebugContext dbg;
+ std::unique_ptr<FSWritableFile> file;
+ Status status;
+ status = file_system_->NewWritableFile(f, FileOptions(options), &file, &dbg);
+ if (status.ok()) {
+ r->reset(new CompositeWritableFileWrapper(file));
+ }
+ return status;
+}
+
+Status CompositeEnv::ReopenWritableFile(const std::string& fname,
+ std::unique_ptr<WritableFile>* result,
+ const EnvOptions& options) {
+ IODebugContext dbg;
+ Status status;
+ std::unique_ptr<FSWritableFile> file;
+ status = file_system_->ReopenWritableFile(fname, FileOptions(options), &file,
+ &dbg);
+ if (status.ok()) {
+ result->reset(new CompositeWritableFileWrapper(file));
+ }
+ return status;
+}
+
+Status CompositeEnv::ReuseWritableFile(const std::string& fname,
+ const std::string& old_fname,
+ std::unique_ptr<WritableFile>* r,
+ const EnvOptions& options) {
+ IODebugContext dbg;
+ Status status;
+ std::unique_ptr<FSWritableFile> file;
+ status = file_system_->ReuseWritableFile(fname, old_fname,
+ FileOptions(options), &file, &dbg);
+ if (status.ok()) {
+ r->reset(new CompositeWritableFileWrapper(file));
+ }
+ return status;
+}
+
+Status CompositeEnv::NewRandomRWFile(const std::string& fname,
+ std::unique_ptr<RandomRWFile>* result,
+ const EnvOptions& options) {
+ IODebugContext dbg;
+ std::unique_ptr<FSRandomRWFile> file;
+ Status status;
+ status =
+ file_system_->NewRandomRWFile(fname, FileOptions(options), &file, &dbg);
+ if (status.ok()) {
+ result->reset(new CompositeRandomRWFileWrapper(file));
+ }
+ return status;
+}
+
+Status CompositeEnv::NewDirectory(const std::string& name,
+ std::unique_ptr<Directory>* result) {
+ IOOptions io_opts;
+ IODebugContext dbg;
+ std::unique_ptr<FSDirectory> dir;
+ Status status;
+ status = file_system_->NewDirectory(name, io_opts, &dir, &dbg);
+ if (status.ok()) {
+ result->reset(new CompositeDirectoryWrapper(dir));
+ }
+ return status;
+}
+
+namespace {
+static std::unordered_map<std::string, OptionTypeInfo> env_wrapper_type_info = {
+#ifndef ROCKSDB_LITE
+ {"target",
+ OptionTypeInfo(0, OptionType::kUnknown, OptionVerificationType::kByName,
+ OptionTypeFlags::kDontSerialize)
+ .SetParseFunc([](const ConfigOptions& opts,
+ const std::string& /*name*/, const std::string& value,
+ void* addr) {
+ auto target = static_cast<EnvWrapper::Target*>(addr);
+ return Env::CreateFromString(opts, value, &(target->env),
+ &(target->guard));
+ })
+ .SetEqualsFunc([](const ConfigOptions& opts,
+ const std::string& /*name*/, const void* addr1,
+ const void* addr2, std::string* mismatch) {
+ const auto target1 = static_cast<const EnvWrapper::Target*>(addr1);
+ const auto target2 = static_cast<const EnvWrapper::Target*>(addr2);
+ if (target1->env != nullptr) {
+ return target1->env->AreEquivalent(opts, target2->env, mismatch);
+ } else {
+ return (target2->env == nullptr);
+ }
+ })
+ .SetPrepareFunc([](const ConfigOptions& opts,
+ const std::string& /*name*/, void* addr) {
+ auto target = static_cast<EnvWrapper::Target*>(addr);
+ if (target->guard.get() != nullptr) {
+ target->env = target->guard.get();
+ } else if (target->env == nullptr) {
+ target->env = Env::Default();
+ }
+ return target->env->PrepareOptions(opts);
+ })
+ .SetValidateFunc([](const DBOptions& db_opts,
+ const ColumnFamilyOptions& cf_opts,
+ const std::string& /*name*/, const void* addr) {
+ const auto target = static_cast<const EnvWrapper::Target*>(addr);
+ if (target->env == nullptr) {
+ return Status::InvalidArgument("Target Env not specified");
+ } else {
+ return target->env->ValidateOptions(db_opts, cf_opts);
+ }
+ })},
+#endif // ROCKSDB_LITE
+};
+static std::unordered_map<std::string, OptionTypeInfo>
+ composite_fs_wrapper_type_info = {
+#ifndef ROCKSDB_LITE
+ {"file_system",
+ OptionTypeInfo::AsCustomSharedPtr<FileSystem>(
+ 0, OptionVerificationType::kByName, OptionTypeFlags::kNone)},
+#endif // ROCKSDB_LITE
+};
+
+static std::unordered_map<std::string, OptionTypeInfo>
+ composite_clock_wrapper_type_info = {
+#ifndef ROCKSDB_LITE
+ {"clock",
+ OptionTypeInfo::AsCustomSharedPtr<SystemClock>(
+ 0, OptionVerificationType::kByName, OptionTypeFlags::kNone)},
+#endif // ROCKSDB_LITE
+};
+
+} // namespace
+
+std::unique_ptr<Env> NewCompositeEnv(const std::shared_ptr<FileSystem>& fs) {
+ return std::unique_ptr<Env>(new CompositeEnvWrapper(Env::Default(), fs));
+}
+
+CompositeEnvWrapper::CompositeEnvWrapper(Env* env,
+ const std::shared_ptr<FileSystem>& fs,
+ const std::shared_ptr<SystemClock>& sc)
+ : CompositeEnv(fs, sc), target_(env) {
+ RegisterOptions("", &target_, &env_wrapper_type_info);
+ RegisterOptions("", &file_system_, &composite_fs_wrapper_type_info);
+ RegisterOptions("", &system_clock_, &composite_clock_wrapper_type_info);
+}
+
+CompositeEnvWrapper::CompositeEnvWrapper(const std::shared_ptr<Env>& env,
+ const std::shared_ptr<FileSystem>& fs,
+ const std::shared_ptr<SystemClock>& sc)
+ : CompositeEnv(fs, sc), target_(env) {
+ RegisterOptions("", &target_, &env_wrapper_type_info);
+ RegisterOptions("", &file_system_, &composite_fs_wrapper_type_info);
+ RegisterOptions("", &system_clock_, &composite_clock_wrapper_type_info);
+}
+
+Status CompositeEnvWrapper::PrepareOptions(const ConfigOptions& options) {
+ target_.Prepare();
+ if (file_system_ == nullptr) {
+ file_system_ = target_.env->GetFileSystem();
+ }
+ if (system_clock_ == nullptr) {
+ system_clock_ = target_.env->GetSystemClock();
+ }
+ return Env::PrepareOptions(options);
+}
+
+#ifndef ROCKSDB_LITE
+std::string CompositeEnvWrapper::SerializeOptions(
+ const ConfigOptions& config_options, const std::string& header) const {
+ auto options = CompositeEnv::SerializeOptions(config_options, header);
+ if (target_.env != nullptr && target_.env != Env::Default()) {
+ options.append("target=");
+ options.append(target_.env->ToString(config_options));
+ }
+ return options;
+}
+#endif // ROCKSDB_LITE
+
+EnvWrapper::EnvWrapper(Env* t) : target_(t) {
+ RegisterOptions("", &target_, &env_wrapper_type_info);
+}
+
+EnvWrapper::EnvWrapper(std::unique_ptr<Env>&& t) : target_(std::move(t)) {
+ RegisterOptions("", &target_, &env_wrapper_type_info);
+}
+
+EnvWrapper::EnvWrapper(const std::shared_ptr<Env>& t) : target_(t) {
+ RegisterOptions("", &target_, &env_wrapper_type_info);
+}
+
+EnvWrapper::~EnvWrapper() {}
+
+Status EnvWrapper::PrepareOptions(const ConfigOptions& options) {
+ target_.Prepare();
+ return Env::PrepareOptions(options);
+}
+
+#ifndef ROCKSDB_LITE
+std::string EnvWrapper::SerializeOptions(const ConfigOptions& config_options,
+ const std::string& header) const {
+ auto parent = Env::SerializeOptions(config_options, "");
+ if (config_options.IsShallow() || target_.env == nullptr ||
+ target_.env == Env::Default()) {
+ return parent;
+ } else {
+ std::string result = header;
+ if (!StartsWith(parent, OptionTypeInfo::kIdPropName())) {
+ result.append(OptionTypeInfo::kIdPropName()).append("=");
+ }
+ result.append(parent);
+ if (!EndsWith(result, config_options.delimiter)) {
+ result.append(config_options.delimiter);
+ }
+ result.append("target=").append(target_.env->ToString(config_options));
+ return result;
+ }
+}
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/env/composite_env_wrapper.h b/src/rocksdb/env/composite_env_wrapper.h
new file mode 100644
index 000000000..78da6f0ed
--- /dev/null
+++ b/src/rocksdb/env/composite_env_wrapper.h
@@ -0,0 +1,380 @@
+// Copyright (c) 2019-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/system_clock.h"
+
+#ifdef _WIN32
+// Windows API macro interference
+#undef DeleteFile
+#undef GetCurrentTime
+#undef LoadLibrary
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+class CompositeEnv : public Env {
+ public:
+ // Initialize a CompositeEnvWrapper that delegates all thread/time related
+ // calls to env, and all file operations to fs
+ explicit CompositeEnv(const std::shared_ptr<FileSystem>& fs,
+ const std::shared_ptr<SystemClock>& clock)
+ : Env(fs, clock) {}
+
+ Status RegisterDbPaths(const std::vector<std::string>& paths) override {
+ return file_system_->RegisterDbPaths(paths);
+ }
+ Status UnregisterDbPaths(const std::vector<std::string>& paths) override {
+ return file_system_->UnregisterDbPaths(paths);
+ }
+
+ // The following text is boilerplate that forwards all methods to target()
+ Status NewSequentialFile(const std::string& f,
+ std::unique_ptr<SequentialFile>* r,
+ const EnvOptions& options) override;
+
+ Status NewRandomAccessFile(const std::string& f,
+ std::unique_ptr<RandomAccessFile>* r,
+ const EnvOptions& options) override;
+
+ Status NewWritableFile(const std::string& f, std::unique_ptr<WritableFile>* r,
+ const EnvOptions& options) override;
+
+ Status ReopenWritableFile(const std::string& fname,
+ std::unique_ptr<WritableFile>* result,
+ const EnvOptions& options) override;
+
+ Status ReuseWritableFile(const std::string& fname,
+ const std::string& old_fname,
+ std::unique_ptr<WritableFile>* r,
+ const EnvOptions& options) override;
+
+ Status NewRandomRWFile(const std::string& fname,
+ std::unique_ptr<RandomRWFile>* result,
+ const EnvOptions& options) override;
+
+ Status NewMemoryMappedFileBuffer(
+ const std::string& fname,
+ std::unique_ptr<MemoryMappedFileBuffer>* result) override {
+ return file_system_->NewMemoryMappedFileBuffer(fname, result);
+ }
+
+ Status NewDirectory(const std::string& name,
+ std::unique_ptr<Directory>* result) override;
+
+ Status FileExists(const std::string& f) override {
+ IOOptions io_opts;
+ IODebugContext dbg;
+ return file_system_->FileExists(f, io_opts, &dbg);
+ }
+ Status GetChildren(const std::string& dir,
+ std::vector<std::string>* r) override {
+ IOOptions io_opts;
+ IODebugContext dbg;
+ return file_system_->GetChildren(dir, io_opts, r, &dbg);
+ }
+ Status GetChildrenFileAttributes(
+ const std::string& dir, std::vector<FileAttributes>* result) override {
+ IOOptions io_opts;
+ IODebugContext dbg;
+ return file_system_->GetChildrenFileAttributes(dir, io_opts, result, &dbg);
+ }
+ Status DeleteFile(const std::string& f) override {
+ IOOptions io_opts;
+ IODebugContext dbg;
+ return file_system_->DeleteFile(f, io_opts, &dbg);
+ }
+ Status Truncate(const std::string& fname, size_t size) override {
+ IOOptions io_opts;
+ IODebugContext dbg;
+ return file_system_->Truncate(fname, size, io_opts, &dbg);
+ }
+ Status CreateDir(const std::string& d) override {
+ IOOptions io_opts;
+ IODebugContext dbg;
+ return file_system_->CreateDir(d, io_opts, &dbg);
+ }
+ Status CreateDirIfMissing(const std::string& d) override {
+ IOOptions io_opts;
+ IODebugContext dbg;
+ return file_system_->CreateDirIfMissing(d, io_opts, &dbg);
+ }
+ Status DeleteDir(const std::string& d) override {
+ IOOptions io_opts;
+ IODebugContext dbg;
+ return file_system_->DeleteDir(d, io_opts, &dbg);
+ }
+ Status GetFileSize(const std::string& f, uint64_t* s) override {
+ IOOptions io_opts;
+ IODebugContext dbg;
+ return file_system_->GetFileSize(f, io_opts, s, &dbg);
+ }
+
+ Status GetFileModificationTime(const std::string& fname,
+ uint64_t* file_mtime) override {
+ IOOptions io_opts;
+ IODebugContext dbg;
+ return file_system_->GetFileModificationTime(fname, io_opts, file_mtime,
+ &dbg);
+ }
+
+ Status RenameFile(const std::string& s, const std::string& t) override {
+ IOOptions io_opts;
+ IODebugContext dbg;
+ return file_system_->RenameFile(s, t, io_opts, &dbg);
+ }
+
+ Status LinkFile(const std::string& s, const std::string& t) override {
+ IOOptions io_opts;
+ IODebugContext dbg;
+ return file_system_->LinkFile(s, t, io_opts, &dbg);
+ }
+
+ Status NumFileLinks(const std::string& fname, uint64_t* count) override {
+ IOOptions io_opts;
+ IODebugContext dbg;
+ return file_system_->NumFileLinks(fname, io_opts, count, &dbg);
+ }
+
+ Status AreFilesSame(const std::string& first, const std::string& second,
+ bool* res) override {
+ IOOptions io_opts;
+ IODebugContext dbg;
+ return file_system_->AreFilesSame(first, second, io_opts, res, &dbg);
+ }
+
+ Status LockFile(const std::string& f, FileLock** l) override {
+ IOOptions io_opts;
+ IODebugContext dbg;
+ return file_system_->LockFile(f, io_opts, l, &dbg);
+ }
+
+ Status UnlockFile(FileLock* l) override {
+ IOOptions io_opts;
+ IODebugContext dbg;
+ return file_system_->UnlockFile(l, io_opts, &dbg);
+ }
+
+ Status GetAbsolutePath(const std::string& db_path,
+ std::string* output_path) override {
+ IOOptions io_opts;
+ IODebugContext dbg;
+ return file_system_->GetAbsolutePath(db_path, io_opts, output_path, &dbg);
+ }
+
+ Status NewLogger(const std::string& fname,
+ std::shared_ptr<Logger>* result) override {
+ IOOptions io_opts;
+ IODebugContext dbg;
+ return file_system_->NewLogger(fname, io_opts, result, &dbg);
+ }
+
+ Status IsDirectory(const std::string& path, bool* is_dir) override {
+ IOOptions io_opts;
+ IODebugContext dbg;
+ return file_system_->IsDirectory(path, io_opts, is_dir, &dbg);
+ }
+
+ Status GetTestDirectory(std::string* path) override {
+ IOOptions io_opts;
+ IODebugContext dbg;
+ return file_system_->GetTestDirectory(io_opts, path, &dbg);
+ }
+
+ EnvOptions OptimizeForLogRead(const EnvOptions& env_options) const override {
+ return file_system_->OptimizeForLogRead(FileOptions(env_options));
+ }
+
+ EnvOptions OptimizeForManifestRead(
+ const EnvOptions& env_options) const override {
+ return file_system_->OptimizeForManifestRead(FileOptions(env_options));
+ }
+
+ EnvOptions OptimizeForLogWrite(const EnvOptions& env_options,
+ const DBOptions& db_options) const override {
+ return file_system_->OptimizeForLogWrite(FileOptions(env_options),
+ db_options);
+ }
+
+ EnvOptions OptimizeForManifestWrite(
+ const EnvOptions& env_options) const override {
+ return file_system_->OptimizeForManifestWrite(FileOptions(env_options));
+ }
+
+ EnvOptions OptimizeForCompactionTableWrite(
+ const EnvOptions& env_options,
+ const ImmutableDBOptions& immutable_ops) const override {
+ return file_system_->OptimizeForCompactionTableWrite(
+ FileOptions(env_options), immutable_ops);
+ }
+ EnvOptions OptimizeForCompactionTableRead(
+ const EnvOptions& env_options,
+ const ImmutableDBOptions& db_options) const override {
+ return file_system_->OptimizeForCompactionTableRead(
+ FileOptions(env_options), db_options);
+ }
+ EnvOptions OptimizeForBlobFileRead(
+ const EnvOptions& env_options,
+ const ImmutableDBOptions& db_options) const override {
+ return file_system_->OptimizeForBlobFileRead(FileOptions(env_options),
+ db_options);
+ }
+ // This seems to clash with a macro on Windows, so #undef it here
+#ifdef GetFreeSpace
+#undef GetFreeSpace
+#endif
+ Status GetFreeSpace(const std::string& path, uint64_t* diskfree) override {
+ IOOptions io_opts;
+ IODebugContext dbg;
+ return file_system_->GetFreeSpace(path, io_opts, diskfree, &dbg);
+ }
+ uint64_t NowMicros() override { return system_clock_->NowMicros(); }
+ uint64_t NowNanos() override { return system_clock_->NowNanos(); }
+
+ uint64_t NowCPUNanos() override { return system_clock_->CPUNanos(); }
+
+ void SleepForMicroseconds(int micros) override {
+ system_clock_->SleepForMicroseconds(micros);
+ }
+
+ Status GetCurrentTime(int64_t* unix_time) override {
+ return system_clock_->GetCurrentTime(unix_time);
+ }
+ std::string TimeToString(uint64_t time) override {
+ return system_clock_->TimeToString(time);
+ }
+};
+
+class CompositeEnvWrapper : public CompositeEnv {
+ public:
+ // Initialize a CompositeEnvWrapper that delegates all thread/time related
+ // calls to env, and all file operations to fs
+ explicit CompositeEnvWrapper(Env* env)
+ : CompositeEnvWrapper(env, env->GetFileSystem(), env->GetSystemClock()) {}
+ explicit CompositeEnvWrapper(Env* env, const std::shared_ptr<FileSystem>& fs)
+ : CompositeEnvWrapper(env, fs, env->GetSystemClock()) {}
+
+ explicit CompositeEnvWrapper(Env* env, const std::shared_ptr<SystemClock>& sc)
+ : CompositeEnvWrapper(env, env->GetFileSystem(), sc) {}
+
+ explicit CompositeEnvWrapper(Env* env, const std::shared_ptr<FileSystem>& fs,
+ const std::shared_ptr<SystemClock>& sc);
+
+ explicit CompositeEnvWrapper(const std::shared_ptr<Env>& env,
+ const std::shared_ptr<FileSystem>& fs)
+ : CompositeEnvWrapper(env, fs, env->GetSystemClock()) {}
+
+ explicit CompositeEnvWrapper(const std::shared_ptr<Env>& env,
+ const std::shared_ptr<SystemClock>& sc)
+ : CompositeEnvWrapper(env, env->GetFileSystem(), sc) {}
+
+ explicit CompositeEnvWrapper(const std::shared_ptr<Env>& env,
+ const std::shared_ptr<FileSystem>& fs,
+ const std::shared_ptr<SystemClock>& sc);
+
+ static const char* kClassName() { return "CompositeEnv"; }
+ const char* Name() const override { return kClassName(); }
+ bool IsInstanceOf(const std::string& name) const override {
+ if (name == kClassName()) {
+ return true;
+ } else {
+ return CompositeEnv::IsInstanceOf(name);
+ }
+ }
+ const Customizable* Inner() const override { return target_.env; }
+
+ Status PrepareOptions(const ConfigOptions& options) override;
+#ifndef ROCKSDB_LITE
+ std::string SerializeOptions(const ConfigOptions& config_options,
+ const std::string& header) const override;
+#endif // ROCKSDB_LITE
+
+ // Return the target to which this Env forwards all calls
+ Env* env_target() const { return target_.env; }
+
+#if !defined(OS_WIN) && !defined(ROCKSDB_NO_DYNAMIC_EXTENSION)
+ Status LoadLibrary(const std::string& lib_name,
+ const std::string& search_path,
+ std::shared_ptr<DynamicLibrary>* result) override {
+ return target_.env->LoadLibrary(lib_name, search_path, result);
+ }
+#endif
+
+ void Schedule(void (*f)(void* arg), void* a, Priority pri,
+ void* tag = nullptr, void (*u)(void* arg) = nullptr) override {
+ return target_.env->Schedule(f, a, pri, tag, u);
+ }
+
+ int UnSchedule(void* tag, Priority pri) override {
+ return target_.env->UnSchedule(tag, pri);
+ }
+
+ void StartThread(void (*f)(void*), void* a) override {
+ return target_.env->StartThread(f, a);
+ }
+ void WaitForJoin() override { return target_.env->WaitForJoin(); }
+ unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const override {
+ return target_.env->GetThreadPoolQueueLen(pri);
+ }
+
+ int ReserveThreads(int threads_to_be_reserved, Priority pri) override {
+ return target_.env->ReserveThreads(threads_to_be_reserved, pri);
+ }
+
+ int ReleaseThreads(int threads_to_be_released, Priority pri) override {
+ return target_.env->ReleaseThreads(threads_to_be_released, pri);
+ }
+
+ Status GetHostName(char* name, uint64_t len) override {
+ return target_.env->GetHostName(name, len);
+ }
+ void SetBackgroundThreads(int num, Priority pri) override {
+ return target_.env->SetBackgroundThreads(num, pri);
+ }
+ int GetBackgroundThreads(Priority pri) override {
+ return target_.env->GetBackgroundThreads(pri);
+ }
+
+ Status SetAllowNonOwnerAccess(bool allow_non_owner_access) override {
+ return target_.env->SetAllowNonOwnerAccess(allow_non_owner_access);
+ }
+
+ void IncBackgroundThreadsIfNeeded(int num, Priority pri) override {
+ return target_.env->IncBackgroundThreadsIfNeeded(num, pri);
+ }
+
+ void LowerThreadPoolIOPriority(Priority pool) override {
+ target_.env->LowerThreadPoolIOPriority(pool);
+ }
+
+ void LowerThreadPoolCPUPriority(Priority pool) override {
+ target_.env->LowerThreadPoolCPUPriority(pool);
+ }
+
+ Status LowerThreadPoolCPUPriority(Priority pool, CpuPriority pri) override {
+ return target_.env->LowerThreadPoolCPUPriority(pool, pri);
+ }
+
+ Status GetThreadList(std::vector<ThreadStatus>* thread_list) override {
+ return target_.env->GetThreadList(thread_list);
+ }
+
+ ThreadStatusUpdater* GetThreadStatusUpdater() const override {
+ return target_.env->GetThreadStatusUpdater();
+ }
+
+ uint64_t GetThreadID() const override { return target_.env->GetThreadID(); }
+
+ std::string GenerateUniqueId() override {
+ return target_.env->GenerateUniqueId();
+ }
+
+ private:
+ EnvWrapper::Target target_;
+};
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/env/emulated_clock.h b/src/rocksdb/env/emulated_clock.h
new file mode 100644
index 000000000..622737635
--- /dev/null
+++ b/src/rocksdb/env/emulated_clock.h
@@ -0,0 +1,114 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <atomic>
+#include <string>
+
+#include "rocksdb/status.h"
+#include "rocksdb/system_clock.h"
+
+namespace ROCKSDB_NAMESPACE {
+// A SystemClock that can "mock" sleep and counts its operations.
+class EmulatedSystemClock : public SystemClockWrapper {
+ private:
+  // Something to return when mocking current time
+  const int64_t maybe_starting_time_;
+  // Calls to SleepForMicroseconds / MockSleepFor* since construction or
+  // the last ResetCounters().
+  std::atomic<int> sleep_counter_{0};
+  // Calls to CPUNanos()/CPUMicros() since construction or ResetCounters().
+  std::atomic<int> cpu_counter_{0};
+  // Accumulated mock time, added on top of the base clock's readings.
+  std::atomic<int64_t> addon_microseconds_{0};
+  // Do not modify in the env of a running DB (could cause deadlock)
+  std::atomic<bool> time_elapse_only_sleep_;
+  bool no_slowdown_;
+
+ public:
+  explicit EmulatedSystemClock(const std::shared_ptr<SystemClock>& base,
+                               bool time_elapse_only_sleep = false);
+
+  static const char* kClassName() { return "TimeEmulatedSystemClock"; }
+  const char* Name() const override { return kClassName(); }
+
+  // Counts the call; advances mock time when mocking or elapse-only-sleep is
+  // enabled, and performs a real sleep only when slowdown is not suppressed.
+  virtual void SleepForMicroseconds(int micros) override {
+    sleep_counter_++;
+    if (no_slowdown_ || time_elapse_only_sleep_) {
+      addon_microseconds_.fetch_add(micros);
+    }
+    if (!no_slowdown_) {
+      SystemClockWrapper::SleepForMicroseconds(micros);
+    }
+  }
+
+  // Advance mock time without sleeping. Requires SetMockSleep(true) first.
+  void MockSleepForMicroseconds(int64_t micros) {
+    sleep_counter_++;
+    assert(no_slowdown_);
+    addon_microseconds_.fetch_add(micros);
+  }
+
+  // Advance mock time by whole seconds without sleeping.
+  void MockSleepForSeconds(int64_t seconds) {
+    sleep_counter_++;
+    assert(no_slowdown_);
+    addon_microseconds_.fetch_add(seconds * 1000000);
+  }
+
+  void SetTimeElapseOnlySleep(bool enabled) {
+    // We cannot set these before destroying the last DB because they might
+    // cause a deadlock or similar without the appropriate options set in
+    // the DB.
+    time_elapse_only_sleep_ = enabled;
+    no_slowdown_ = enabled;
+  }
+
+  bool IsTimeElapseOnlySleep() const { return time_elapse_only_sleep_.load(); }
+  void SetMockSleep(bool enabled = true) { no_slowdown_ = enabled; }
+  bool IsMockSleepEnabled() const { return no_slowdown_; }
+
+  int GetSleepCounter() const { return sleep_counter_.load(); }
+
+  // Reports either the fixed starting time (when only mock-sleeping) or the
+  // base clock's time; in both cases advanced by the accumulated mock sleep.
+  virtual Status GetCurrentTime(int64_t* unix_time) override {
+    Status s;
+    if (time_elapse_only_sleep_) {
+      *unix_time = maybe_starting_time_;
+    } else {
+      s = SystemClockWrapper::GetCurrentTime(unix_time);
+    }
+    if (s.ok()) {
+      // mock microseconds elapsed to seconds of time
+      *unix_time += addon_microseconds_.load() / 1000000;
+    }
+    return s;
+  }
+
+  virtual uint64_t CPUNanos() override {
+    cpu_counter_++;
+    return SystemClockWrapper::CPUNanos();
+  }
+
+  virtual uint64_t CPUMicros() override {
+    cpu_counter_++;
+    return SystemClockWrapper::CPUMicros();
+  }
+
+  // When elapse-only-sleep is on, the base clock is bypassed entirely and
+  // only the accumulated mock time is reported.
+  virtual uint64_t NowNanos() override {
+    return (time_elapse_only_sleep_ ? 0 : SystemClockWrapper::NowNanos()) +
+           addon_microseconds_.load() * 1000;
+  }
+
+  virtual uint64_t NowMicros() override {
+    return (time_elapse_only_sleep_ ? 0 : SystemClockWrapper::NowMicros()) +
+           addon_microseconds_.load();
+  }
+
+  int GetCpuCounter() const { return cpu_counter_.load(); }
+
+  // Zeroes the call counters; does NOT reset accumulated mock time.
+  void ResetCounters() {
+    cpu_counter_.store(0);
+    sleep_counter_.store(0);
+  }
+};
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/env/env.cc b/src/rocksdb/env/env.cc
new file mode 100644
index 000000000..f70d1f067
--- /dev/null
+++ b/src/rocksdb/env/env.cc
@@ -0,0 +1,1264 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/env.h"
+
+#include <thread>
+
+#include "env/composite_env_wrapper.h"
+#include "env/emulated_clock.h"
+#include "env/mock_env.h"
+#include "env/unique_id_gen.h"
+#include "logging/env_logger.h"
+#include "memory/arena.h"
+#include "options/db_options.h"
+#include "port/port.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/options.h"
+#include "rocksdb/system_clock.h"
+#include "rocksdb/utilities/customizable_util.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "rocksdb/utilities/options_type.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+#ifndef ROCKSDB_LITE
+// Registers the Env implementations that ship with RocksDB (MockEnv and
+// CompositeEnvWrapper) in the given object library, so they can be created
+// by name via Env::CreateFromString. Returns the resulting factory count.
+static int RegisterBuiltinEnvs(ObjectLibrary& library,
+                               const std::string& /*arg*/) {
+  library.AddFactory<Env>(MockEnv::kClassName(), [](const std::string& /*uri*/,
+                                                    std::unique_ptr<Env>* guard,
+                                                    std::string* /* errmsg */) {
+    guard->reset(MockEnv::Create(Env::Default()));
+    return guard->get();
+  });
+  library.AddFactory<Env>(
+      CompositeEnvWrapper::kClassName(),
+      [](const std::string& /*uri*/, std::unique_ptr<Env>* guard,
+         std::string* /* errmsg */) {
+        guard->reset(new CompositeEnvWrapper(Env::Default()));
+        return guard->get();
+      });
+  size_t num_types;
+  return static_cast<int>(library.GetFactoryCount(&num_types));
+}
+#endif // ROCKSDB_LITE
+
+// Registers the built-in Env factories exactly once per process.
+// A no-op in LITE builds, where the object registry is unavailable.
+static void RegisterSystemEnvs() {
+#ifndef ROCKSDB_LITE
+  static std::once_flag once;
+  std::call_once(
+      once, []() { RegisterBuiltinEnvs(*(ObjectLibrary::Default().get()), ""); });
+#endif  // ROCKSDB_LITE
+}
+
+// Adapts the clock-related methods of a legacy Env to the SystemClock
+// interface, for Env implementations that predate the SystemClock split.
+// Does not own env_; the Env must outlive this clock.
+class LegacySystemClock : public SystemClock {
+ private:
+  Env* env_;
+
+ public:
+  explicit LegacySystemClock(Env* env) : env_(env) {}
+  const char* Name() const override { return "LegacySystemClock"; }
+
+  // Returns the number of micro-seconds since some fixed point in time.
+  // It is often used as system time such as in GenericRateLimiter
+  // and other places so a port needs to return system time in order to work.
+  uint64_t NowMicros() override { return env_->NowMicros(); }
+
+  // Returns the number of nano-seconds since some fixed point in time. Only
+  // useful for computing deltas of time in one run.
+  // Default implementation simply relies on NowMicros.
+  // In platform-specific implementations, NowNanos() should return time points
+  // that are MONOTONIC.
+  uint64_t NowNanos() override { return env_->NowNanos(); }
+
+  uint64_t CPUMicros() override { return CPUNanos() / 1000; }
+  uint64_t CPUNanos() override { return env_->NowCPUNanos(); }
+
+  // Sleep/delay the thread for the prescribed number of micro-seconds.
+  void SleepForMicroseconds(int micros) override {
+    env_->SleepForMicroseconds(micros);
+  }
+
+  // Get the number of seconds since the Epoch, 1970-01-01 00:00:00 (UTC).
+  // Only overwrites *unix_time on success.
+  Status GetCurrentTime(int64_t* unix_time) override {
+    return env_->GetCurrentTime(unix_time);
+  }
+  // Converts seconds-since-Jan-01-1970 to a printable string
+  std::string TimeToString(uint64_t time) override {
+    return env_->TimeToString(time);
+  }
+
+#ifndef ROCKSDB_LITE
+  std::string SerializeOptions(const ConfigOptions& /*config_options*/,
+                               const std::string& /*prefix*/) const override {
+    // We do not want the LegacySystemClock to appear in the serialized output.
+    // This clock is an internal class for those who do not implement one and
+    // would be part of the Env. As such, do not serialize it here.
+    return "";
+  }
+#endif  // ROCKSDB_LITE
+};
+
+// Wraps an owned legacy SequentialFile as an FSSequentialFile, converting
+// Status to IOStatus and dropping the IOOptions/IODebugContext arguments
+// that the legacy API does not accept.
+class LegacySequentialFileWrapper : public FSSequentialFile {
+ public:
+  explicit LegacySequentialFileWrapper(
+      std::unique_ptr<SequentialFile>&& _target)
+      : target_(std::move(_target)) {}
+
+  IOStatus Read(size_t n, const IOOptions& /*options*/, Slice* result,
+                char* scratch, IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->Read(n, result, scratch));
+  }
+  IOStatus Skip(uint64_t n) override {
+    return status_to_io_status(target_->Skip(n));
+  }
+  bool use_direct_io() const override { return target_->use_direct_io(); }
+  size_t GetRequiredBufferAlignment() const override {
+    return target_->GetRequiredBufferAlignment();
+  }
+  IOStatus InvalidateCache(size_t offset, size_t length) override {
+    return status_to_io_status(target_->InvalidateCache(offset, length));
+  }
+  IOStatus PositionedRead(uint64_t offset, size_t n,
+                          const IOOptions& /*options*/, Slice* result,
+                          char* scratch, IODebugContext* /*dbg*/) override {
+    return status_to_io_status(
+        target_->PositionedRead(offset, n, result, scratch));
+  }
+
+ private:
+  std::unique_ptr<SequentialFile> target_;
+};
+
+// Wraps an owned legacy RandomAccessFile as an FSRandomAccessFile,
+// converting Status to IOStatus and translating between the FS and legacy
+// read-request structures for MultiRead.
+class LegacyRandomAccessFileWrapper : public FSRandomAccessFile {
+ public:
+  explicit LegacyRandomAccessFileWrapper(
+      std::unique_ptr<RandomAccessFile>&& target)
+      : target_(std::move(target)) {}
+
+  IOStatus Read(uint64_t offset, size_t n, const IOOptions& /*options*/,
+                Slice* result, char* scratch,
+                IODebugContext* /*dbg*/) const override {
+    return status_to_io_status(target_->Read(offset, n, result, scratch));
+  }
+
+  // Copies each FSReadRequest into a legacy ReadRequest, performs the batch
+  // read, then copies the per-request results and statuses back out.
+  IOStatus MultiRead(FSReadRequest* fs_reqs, size_t num_reqs,
+                     const IOOptions& /*options*/,
+                     IODebugContext* /*dbg*/) override {
+    std::vector<ReadRequest> reqs;
+    Status status;
+
+    reqs.reserve(num_reqs);
+    for (size_t i = 0; i < num_reqs; ++i) {
+      ReadRequest req;
+
+      req.offset = fs_reqs[i].offset;
+      req.len = fs_reqs[i].len;
+      req.scratch = fs_reqs[i].scratch;
+      req.status = Status::OK();
+
+      reqs.emplace_back(req);
+    }
+    status = target_->MultiRead(reqs.data(), num_reqs);
+    for (size_t i = 0; i < num_reqs; ++i) {
+      fs_reqs[i].result = reqs[i].result;
+      fs_reqs[i].status = status_to_io_status(std::move(reqs[i].status));
+    }
+    return status_to_io_status(std::move(status));
+  }
+
+  IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& /*options*/,
+                    IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->Prefetch(offset, n));
+  }
+  size_t GetUniqueId(char* id, size_t max_size) const override {
+    return target_->GetUniqueId(id, max_size);
+  }
+  void Hint(AccessPattern pattern) override {
+    // The two AccessPattern enums are kept value-compatible; cast through.
+    target_->Hint((RandomAccessFile::AccessPattern)pattern);
+  }
+  bool use_direct_io() const override { return target_->use_direct_io(); }
+  size_t GetRequiredBufferAlignment() const override {
+    return target_->GetRequiredBufferAlignment();
+  }
+  IOStatus InvalidateCache(size_t offset, size_t length) override {
+    return status_to_io_status(target_->InvalidateCache(offset, length));
+  }
+
+ private:
+  std::unique_ptr<RandomAccessFile> target_;
+};
+
+// Wraps an owned legacy RandomRWFile as an FSRandomRWFile, converting
+// Status to IOStatus and ignoring the IOOptions/IODebugContext arguments.
+class LegacyRandomRWFileWrapper : public FSRandomRWFile {
+ public:
+  explicit LegacyRandomRWFileWrapper(std::unique_ptr<RandomRWFile>&& target)
+      : target_(std::move(target)) {}
+
+  bool use_direct_io() const override { return target_->use_direct_io(); }
+  size_t GetRequiredBufferAlignment() const override {
+    return target_->GetRequiredBufferAlignment();
+  }
+  IOStatus Write(uint64_t offset, const Slice& data,
+                 const IOOptions& /*options*/,
+                 IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->Write(offset, data));
+  }
+  IOStatus Read(uint64_t offset, size_t n, const IOOptions& /*options*/,
+                Slice* result, char* scratch,
+                IODebugContext* /*dbg*/) const override {
+    return status_to_io_status(target_->Read(offset, n, result, scratch));
+  }
+  IOStatus Flush(const IOOptions& /*options*/,
+                 IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->Flush());
+  }
+  IOStatus Sync(const IOOptions& /*options*/,
+                IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->Sync());
+  }
+  IOStatus Fsync(const IOOptions& /*options*/,
+                 IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->Fsync());
+  }
+  IOStatus Close(const IOOptions& /*options*/,
+                 IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->Close());
+  }
+
+ private:
+  std::unique_ptr<RandomRWFile> target_;
+};
+
+// Wraps an owned legacy WritableFile as an FSWritableFile. Converts Status
+// to IOStatus; the DataVerificationInfo overloads forward to the same legacy
+// calls because the legacy API has no verification support.
+class LegacyWritableFileWrapper : public FSWritableFile {
+ public:
+  explicit LegacyWritableFileWrapper(std::unique_ptr<WritableFile>&& _target)
+      : target_(std::move(_target)) {}
+
+  IOStatus Append(const Slice& data, const IOOptions& /*options*/,
+                  IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->Append(data));
+  }
+  IOStatus Append(const Slice& data, const IOOptions& /*options*/,
+                  const DataVerificationInfo& /*verification_info*/,
+                  IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->Append(data));
+  }
+  IOStatus PositionedAppend(const Slice& data, uint64_t offset,
+                            const IOOptions& /*options*/,
+                            IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->PositionedAppend(data, offset));
+  }
+  IOStatus PositionedAppend(const Slice& data, uint64_t offset,
+                            const IOOptions& /*options*/,
+                            const DataVerificationInfo& /*verification_info*/,
+                            IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->PositionedAppend(data, offset));
+  }
+  IOStatus Truncate(uint64_t size, const IOOptions& /*options*/,
+                    IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->Truncate(size));
+  }
+  IOStatus Close(const IOOptions& /*options*/,
+                 IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->Close());
+  }
+  IOStatus Flush(const IOOptions& /*options*/,
+                 IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->Flush());
+  }
+  IOStatus Sync(const IOOptions& /*options*/,
+                IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->Sync());
+  }
+  IOStatus Fsync(const IOOptions& /*options*/,
+                 IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->Fsync());
+  }
+  bool IsSyncThreadSafe() const override { return target_->IsSyncThreadSafe(); }
+
+  bool use_direct_io() const override { return target_->use_direct_io(); }
+
+  size_t GetRequiredBufferAlignment() const override {
+    return target_->GetRequiredBufferAlignment();
+  }
+
+  void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override {
+    target_->SetWriteLifeTimeHint(hint);
+  }
+
+  Env::WriteLifeTimeHint GetWriteLifeTimeHint() override {
+    return target_->GetWriteLifeTimeHint();
+  }
+
+  uint64_t GetFileSize(const IOOptions& /*options*/,
+                       IODebugContext* /*dbg*/) override {
+    return target_->GetFileSize();
+  }
+
+  void SetPreallocationBlockSize(size_t size) override {
+    target_->SetPreallocationBlockSize(size);
+  }
+
+  void GetPreallocationStatus(size_t* block_size,
+                              size_t* last_allocated_block) override {
+    target_->GetPreallocationStatus(block_size, last_allocated_block);
+  }
+
+  size_t GetUniqueId(char* id, size_t max_size) const override {
+    return target_->GetUniqueId(id, max_size);
+  }
+
+  IOStatus InvalidateCache(size_t offset, size_t length) override {
+    return status_to_io_status(target_->InvalidateCache(offset, length));
+  }
+
+  IOStatus RangeSync(uint64_t offset, uint64_t nbytes,
+                     const IOOptions& /*options*/,
+                     IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->RangeSync(offset, nbytes));
+  }
+
+  void PrepareWrite(size_t offset, size_t len, const IOOptions& /*options*/,
+                    IODebugContext* /*dbg*/) override {
+    target_->PrepareWrite(offset, len);
+  }
+
+  IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& /*options*/,
+                    IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->Allocate(offset, len));
+  }
+
+ private:
+  std::unique_ptr<WritableFile> target_;
+};
+
+// Wraps an owned legacy Directory as an FSDirectory, converting Status to
+// IOStatus and ignoring the IOOptions/IODebugContext arguments.
+class LegacyDirectoryWrapper : public FSDirectory {
+ public:
+  explicit LegacyDirectoryWrapper(std::unique_ptr<Directory>&& target)
+      : target_(std::move(target)) {}
+
+  IOStatus Fsync(const IOOptions& /*options*/,
+                 IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->Fsync());
+  }
+  IOStatus Close(const IOOptions& /*options*/,
+                 IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->Close());
+  }
+  size_t GetUniqueId(char* id, size_t max_size) const override {
+    return target_->GetUniqueId(id, max_size);
+  }
+
+ private:
+  std::unique_ptr<Directory> target_;
+};
+
+// Presents a legacy Env's file operations through the FileSystem interface.
+// Every file/directory handle returned is wrapped in the corresponding
+// Legacy*Wrapper above, and every Status is converted to an IOStatus.
+// Does not own target_; the Env must outlive this FileSystem.
+class LegacyFileSystemWrapper : public FileSystem {
+ public:
+  // Initialize a FileSystem that delegates all calls to *t
+  explicit LegacyFileSystemWrapper(Env* t) : target_(t) {}
+  ~LegacyFileSystemWrapper() override {}
+
+  static const char* kClassName() { return "LegacyFileSystem"; }
+  const char* Name() const override { return kClassName(); }
+
+  // Return the target to which this FileSystem forwards all calls
+  Env* target() const { return target_; }
+
+  // The following text is boilerplate that forwards all methods to target()
+  IOStatus NewSequentialFile(const std::string& f, const FileOptions& file_opts,
+                             std::unique_ptr<FSSequentialFile>* r,
+                             IODebugContext* /*dbg*/) override {
+    std::unique_ptr<SequentialFile> file;
+    Status s = target_->NewSequentialFile(f, &file, file_opts);
+    if (s.ok()) {
+      r->reset(new LegacySequentialFileWrapper(std::move(file)));
+    }
+    return status_to_io_status(std::move(s));
+  }
+  IOStatus NewRandomAccessFile(const std::string& f,
+                               const FileOptions& file_opts,
+                               std::unique_ptr<FSRandomAccessFile>* r,
+                               IODebugContext* /*dbg*/) override {
+    std::unique_ptr<RandomAccessFile> file;
+    Status s = target_->NewRandomAccessFile(f, &file, file_opts);
+    if (s.ok()) {
+      r->reset(new LegacyRandomAccessFileWrapper(std::move(file)));
+    }
+    return status_to_io_status(std::move(s));
+  }
+  IOStatus NewWritableFile(const std::string& f, const FileOptions& file_opts,
+                           std::unique_ptr<FSWritableFile>* r,
+                           IODebugContext* /*dbg*/) override {
+    std::unique_ptr<WritableFile> file;
+    Status s = target_->NewWritableFile(f, &file, file_opts);
+    if (s.ok()) {
+      r->reset(new LegacyWritableFileWrapper(std::move(file)));
+    }
+    return status_to_io_status(std::move(s));
+  }
+  IOStatus ReopenWritableFile(const std::string& fname,
+                              const FileOptions& file_opts,
+                              std::unique_ptr<FSWritableFile>* result,
+                              IODebugContext* /*dbg*/) override {
+    std::unique_ptr<WritableFile> file;
+    Status s = target_->ReopenWritableFile(fname, &file, file_opts);
+    if (s.ok()) {
+      result->reset(new LegacyWritableFileWrapper(std::move(file)));
+    }
+    return status_to_io_status(std::move(s));
+  }
+  IOStatus ReuseWritableFile(const std::string& fname,
+                             const std::string& old_fname,
+                             const FileOptions& file_opts,
+                             std::unique_ptr<FSWritableFile>* r,
+                             IODebugContext* /*dbg*/) override {
+    std::unique_ptr<WritableFile> file;
+    Status s = target_->ReuseWritableFile(fname, old_fname, &file, file_opts);
+    if (s.ok()) {
+      r->reset(new LegacyWritableFileWrapper(std::move(file)));
+    }
+    return status_to_io_status(std::move(s));
+  }
+  IOStatus NewRandomRWFile(const std::string& fname,
+                           const FileOptions& file_opts,
+                           std::unique_ptr<FSRandomRWFile>* result,
+                           IODebugContext* /*dbg*/) override {
+    std::unique_ptr<RandomRWFile> file;
+    Status s = target_->NewRandomRWFile(fname, &file, file_opts);
+    if (s.ok()) {
+      result->reset(new LegacyRandomRWFileWrapper(std::move(file)));
+    }
+    return status_to_io_status(std::move(s));
+  }
+  IOStatus NewMemoryMappedFileBuffer(
+      const std::string& fname,
+      std::unique_ptr<MemoryMappedFileBuffer>* result) override {
+    return status_to_io_status(
+        target_->NewMemoryMappedFileBuffer(fname, result));
+  }
+  IOStatus NewDirectory(const std::string& name, const IOOptions& /*io_opts*/,
+                        std::unique_ptr<FSDirectory>* result,
+                        IODebugContext* /*dbg*/) override {
+    std::unique_ptr<Directory> dir;
+    Status s = target_->NewDirectory(name, &dir);
+    if (s.ok()) {
+      result->reset(new LegacyDirectoryWrapper(std::move(dir)));
+    }
+    return status_to_io_status(std::move(s));
+  }
+  IOStatus FileExists(const std::string& f, const IOOptions& /*io_opts*/,
+                      IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->FileExists(f));
+  }
+  IOStatus GetChildren(const std::string& dir, const IOOptions& /*io_opts*/,
+                       std::vector<std::string>* r,
+                       IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->GetChildren(dir, r));
+  }
+  IOStatus GetChildrenFileAttributes(const std::string& dir,
+                                     const IOOptions& /*options*/,
+                                     std::vector<FileAttributes>* result,
+                                     IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->GetChildrenFileAttributes(dir, result));
+  }
+  IOStatus DeleteFile(const std::string& f, const IOOptions& /*options*/,
+                      IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->DeleteFile(f));
+  }
+  IOStatus Truncate(const std::string& fname, size_t size,
+                    const IOOptions& /*options*/,
+                    IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->Truncate(fname, size));
+  }
+  IOStatus CreateDir(const std::string& d, const IOOptions& /*options*/,
+                     IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->CreateDir(d));
+  }
+  IOStatus CreateDirIfMissing(const std::string& d,
+                              const IOOptions& /*options*/,
+                              IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->CreateDirIfMissing(d));
+  }
+  IOStatus DeleteDir(const std::string& d, const IOOptions& /*options*/,
+                     IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->DeleteDir(d));
+  }
+  IOStatus GetFileSize(const std::string& f, const IOOptions& /*options*/,
+                       uint64_t* s, IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->GetFileSize(f, s));
+  }
+
+  IOStatus GetFileModificationTime(const std::string& fname,
+                                   const IOOptions& /*options*/,
+                                   uint64_t* file_mtime,
+                                   IODebugContext* /*dbg*/) override {
+    return status_to_io_status(
+        target_->GetFileModificationTime(fname, file_mtime));
+  }
+
+  IOStatus GetAbsolutePath(const std::string& db_path,
+                           const IOOptions& /*options*/,
+                           std::string* output_path,
+                           IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->GetAbsolutePath(db_path, output_path));
+  }
+
+  IOStatus RenameFile(const std::string& s, const std::string& t,
+                      const IOOptions& /*options*/,
+                      IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->RenameFile(s, t));
+  }
+
+  IOStatus LinkFile(const std::string& s, const std::string& t,
+                    const IOOptions& /*options*/,
+                    IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->LinkFile(s, t));
+  }
+
+  IOStatus NumFileLinks(const std::string& fname, const IOOptions& /*options*/,
+                        uint64_t* count, IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->NumFileLinks(fname, count));
+  }
+
+  IOStatus AreFilesSame(const std::string& first, const std::string& second,
+                        const IOOptions& /*options*/, bool* res,
+                        IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->AreFilesSame(first, second, res));
+  }
+
+  IOStatus LockFile(const std::string& f, const IOOptions& /*options*/,
+                    FileLock** l, IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->LockFile(f, l));
+  }
+
+  IOStatus UnlockFile(FileLock* l, const IOOptions& /*options*/,
+                      IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->UnlockFile(l));
+  }
+
+  IOStatus GetTestDirectory(const IOOptions& /*options*/, std::string* path,
+                            IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->GetTestDirectory(path));
+  }
+  IOStatus NewLogger(const std::string& fname, const IOOptions& /*options*/,
+                     std::shared_ptr<Logger>* result,
+                     IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->NewLogger(fname, result));
+  }
+
+  void SanitizeFileOptions(FileOptions* opts) const override {
+    target_->SanitizeEnvOptions(opts);
+  }
+
+  FileOptions OptimizeForLogRead(
+      const FileOptions& file_options) const override {
+    return target_->OptimizeForLogRead(file_options);
+  }
+  FileOptions OptimizeForManifestRead(
+      const FileOptions& file_options) const override {
+    return target_->OptimizeForManifestRead(file_options);
+  }
+  FileOptions OptimizeForLogWrite(const FileOptions& file_options,
+                                  const DBOptions& db_options) const override {
+    return target_->OptimizeForLogWrite(file_options, db_options);
+  }
+  FileOptions OptimizeForManifestWrite(
+      const FileOptions& file_options) const override {
+    return target_->OptimizeForManifestWrite(file_options);
+  }
+  FileOptions OptimizeForCompactionTableWrite(
+      const FileOptions& file_options,
+      const ImmutableDBOptions& immutable_ops) const override {
+    return target_->OptimizeForCompactionTableWrite(file_options,
+                                                    immutable_ops);
+  }
+  FileOptions OptimizeForCompactionTableRead(
+      const FileOptions& file_options,
+      const ImmutableDBOptions& db_options) const override {
+    return target_->OptimizeForCompactionTableRead(file_options, db_options);
+  }
+  FileOptions OptimizeForBlobFileRead(
+      const FileOptions& file_options,
+      const ImmutableDBOptions& db_options) const override {
+    return target_->OptimizeForBlobFileRead(file_options, db_options);
+  }
+
+// Windows headers may define GetFreeSpace as a macro; neutralize it so the
+// override below keeps its intended name.
+#ifdef GetFreeSpace
+#undef GetFreeSpace
+#endif
+  IOStatus GetFreeSpace(const std::string& path, const IOOptions& /*options*/,
+                        uint64_t* diskfree, IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->GetFreeSpace(path, diskfree));
+  }
+  IOStatus IsDirectory(const std::string& path, const IOOptions& /*options*/,
+                       bool* is_dir, IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->IsDirectory(path, is_dir));
+  }
+
+#ifndef ROCKSDB_LITE
+  std::string SerializeOptions(const ConfigOptions& /*config_options*/,
+                               const std::string& /*prefix*/) const override {
+    // We do not want the LegacyFileSystem to appear in the serialized output.
+    // This file system is an internal class for those who do not implement
+    // one and would be part of the Env. As such, do not serialize it here.
+    return "";
+  }
+#endif  // ROCKSDB_LITE
+ private:
+  Env* target_;
+};
+} // end anonymous namespace
+
+// Default constructor: synthesize FileSystem/SystemClock facades that
+// forward back into this Env's virtual methods (legacy Env subclasses).
+Env::Env() : thread_status_updater_(nullptr) {
+  file_system_ = std::make_shared<LegacyFileSystemWrapper>(this);
+  system_clock_ = std::make_shared<LegacySystemClock>(this);
+}
+
+// Use the supplied FileSystem but still adapt this Env as the clock.
+Env::Env(const std::shared_ptr<FileSystem>& fs)
+    : thread_status_updater_(nullptr), file_system_(fs) {
+  system_clock_ = std::make_shared<LegacySystemClock>(this);
+}
+
+// Fully-specified: both FileSystem and SystemClock are provided.
+Env::Env(const std::shared_ptr<FileSystem>& fs,
+         const std::shared_ptr<SystemClock>& clock)
+    : thread_status_updater_(nullptr), file_system_(fs), system_clock_(clock) {}
+
+Env::~Env() {}
+
+// Default logger factory: build an EnvLogger on top of this Env.
+Status Env::NewLogger(const std::string& fname,
+                      std::shared_ptr<Logger>* result) {
+  return NewEnvLogger(fname, this, result);
+}
+
+// Deprecated alias for CreateFromString with default ConfigOptions.
+Status Env::LoadEnv(const std::string& value, Env** result) {
+  return CreateFromString(ConfigOptions(), value, result);
+}
+
+// Resolves an Env by name/URI. An empty value or a name matching the
+// default Env yields Env::Default(); otherwise the object registry is
+// consulted (static objects only in this overload — no ownership returned).
+Status Env::CreateFromString(const ConfigOptions& config_options,
+                             const std::string& value, Env** result) {
+  Env* base = Env::Default();
+  if (value.empty() || base->IsInstanceOf(value)) {
+    *result = base;
+    return Status::OK();
+  } else {
+    RegisterSystemEnvs();
+    Env* env = *result;
+    Status s = LoadStaticObject<Env>(config_options, value, nullptr, &env);
+    if (s.ok()) {
+      *result = env;
+    }
+    return s;
+  }
+}
+
+// Deprecated alias for the guarded CreateFromString overload.
+Status Env::LoadEnv(const std::string& value, Env** result,
+                    std::shared_ptr<Env>* guard) {
+  return CreateFromString(ConfigOptions(), value, result, guard);
+}
+
+// Resolves an Env by name/URI, optionally transferring ownership: if the
+// registry creates a new (guarded) object, *guard owns it; for static or
+// default Envs *guard ends up empty and *result is a borrowed pointer.
+Status Env::CreateFromString(const ConfigOptions& config_options,
+                             const std::string& value, Env** result,
+                             std::shared_ptr<Env>* guard) {
+  assert(result);
+  assert(guard != nullptr);
+  std::unique_ptr<Env> uniq;
+
+  Env* env = *result;
+  std::string id;
+  std::unordered_map<std::string, std::string> opt_map;
+
+  // Split "name;opt=val;..." into the id and its option map.
+  Status status =
+      Customizable::GetOptionsMap(config_options, env, value, &id, &opt_map);
+  if (!status.ok()) {  // GetOptionsMap failed
+    return status;
+  }
+  Env* base = Env::Default();
+  if (id.empty() || base->IsInstanceOf(id)) {
+    env = base;
+    status = Status::OK();
+  } else {
+    RegisterSystemEnvs();
+#ifndef ROCKSDB_LITE
+    // First, try to load the Env as a unique object.
+    status = config_options.registry->NewObject<Env>(id, &env, &uniq);
+#else
+    status =
+        Status::NotSupported("Cannot load environment in LITE mode", value);
+#endif
+  }
+  if (config_options.ignore_unsupported_options && status.IsNotSupported()) {
+    status = Status::OK();
+  } else if (status.ok()) {
+    status = Customizable::ConfigureNewObject(config_options, env, opt_map);
+  }
+  if (status.ok()) {
+    // Hand ownership (possibly nullptr) to the caller's guard.
+    guard->reset(uniq.release());
+    *result = env;
+  }
+  return status;
+}
+
+// Builds an Env from at most one of env_uri / fs_uri. With neither, the
+// ConfigOptions' env is returned un-owned; with both, InvalidArgument.
+// An fs_uri produces a CompositeEnvWrapper over the loaded FileSystem,
+// owned by *guard.
+Status Env::CreateFromUri(const ConfigOptions& config_options,
+                          const std::string& env_uri, const std::string& fs_uri,
+                          Env** result, std::shared_ptr<Env>* guard) {
+  *result = config_options.env;
+  if (env_uri.empty() && fs_uri.empty()) {
+    // Neither specified. Use the default
+    guard->reset();
+    return Status::OK();
+  } else if (!env_uri.empty() && !fs_uri.empty()) {
+    // Both specified. Cannot choose. Return Invalid
+    return Status::InvalidArgument("cannot specify both fs_uri and env_uri");
+  } else if (fs_uri.empty()) {  // Only have an ENV URI. Create an Env from it
+    return CreateFromString(config_options, env_uri, result, guard);
+  } else {
+    std::shared_ptr<FileSystem> fs;
+    Status s = FileSystem::CreateFromString(config_options, fs_uri, &fs);
+    if (s.ok()) {
+      guard->reset(new CompositeEnvWrapper(*result, fs));
+      *result = guard->get();
+    }
+    return s;
+  }
+}
+
+// Maps a thread-pool priority to its human-readable name.
+// TOTAL is a pool count, not a real priority, so it asserts in debug builds
+// and, like any unknown value, yields "Invalid" in release builds.
+std::string Env::PriorityToString(Env::Priority priority) {
+  if (priority == Env::Priority::BOTTOM) {
+    return "Bottom";
+  }
+  if (priority == Env::Priority::LOW) {
+    return "Low";
+  }
+  if (priority == Env::Priority::HIGH) {
+    return "High";
+  }
+  if (priority == Env::Priority::USER) {
+    return "User";
+  }
+  if (priority == Env::Priority::TOTAL) {
+    assert(false);
+  }
+  return "Invalid";
+}
+
+// Default thread-id implementation: hash the calling thread's std::thread id
+// into a 64-bit value. Ports may override with a native thread id.
+uint64_t Env::GetThreadID() const {
+  return std::hash<std::thread::id>{}(std::this_thread::get_id());
+}
+
+// Default "reuse" strategy: rename the old file over the target name, then
+// reopen it as a fresh writable file. Fails fast if the rename fails.
+Status Env::ReuseWritableFile(const std::string& fname,
+                              const std::string& old_fname,
+                              std::unique_ptr<WritableFile>* result,
+                              const EnvOptions& options) {
+  const Status rename_status = RenameFile(old_fname, fname);
+  return rename_status.ok() ? NewWritableFile(fname, result, options)
+                            : rename_status;
+}
+
+// Default implementation: list the directory, then stat each child.
+// Children that disappear between listing and stat-ing are skipped; the
+// result vector is compacted in place to hold only surviving entries.
+Status Env::GetChildrenFileAttributes(const std::string& dir,
+                                      std::vector<FileAttributes>* result) {
+  assert(result != nullptr);
+  std::vector<std::string> child_fnames;
+  Status s = GetChildren(dir, &child_fnames);
+  if (!s.ok()) {
+    return s;
+  }
+  result->resize(child_fnames.size());
+  size_t result_size = 0;
+  for (size_t i = 0; i < child_fnames.size(); ++i) {
+    const std::string path = dir + "/" + child_fnames[i];
+    if (!(s = GetFileSize(path, &(*result)[result_size].size_bytes)).ok()) {
+      if (FileExists(path).IsNotFound()) {
+        // The file may have been deleted since we listed the directory
+        continue;
+      }
+      return s;
+    }
+    (*result)[result_size].name = std::move(child_fnames[i]);
+    result_size++;
+  }
+  result->resize(result_size);
+  return Status::OK();
+}
+
+// Convenience wrapper over GetHostName() that returns a std::string.
+// The buffer is forcibly NUL-terminated in case the port's GetHostName
+// truncates without terminating.
+Status Env::GetHostNameString(std::string* result) {
+  std::array<char, kMaxHostNameLen> hostname_buf{};
+  Status s = GetHostName(hostname_buf.data(), hostname_buf.size());
+  if (s.ok()) {
+    hostname_buf[hostname_buf.size() - 1] = '\0';
+    result->assign(hostname_buf.data());
+  }
+  return s;
+}
+
+// Returns a 36-character RFC 4122 UUID string. Prefers the platform's UUID
+// facility; otherwise derives a variant-1 version-4 UUID from 128 random
+// bits produced by GenerateRawUniqueId.
+std::string Env::GenerateUniqueId() {
+  std::string result;
+  bool success = port::GenerateRfcUuid(&result);
+  if (!success) {
+    // Fall back on our own way of generating a unique ID and adapt it to
+    // RFC 4122 variant 1 version 4 (a random ID).
+    // https://en.wikipedia.org/wiki/Universally_unique_identifier
+    // We already tried GenerateRfcUuid so no need to try it again in
+    // GenerateRawUniqueId
+    constexpr bool exclude_port_uuid = true;
+    uint64_t upper, lower;
+    GenerateRawUniqueId(&upper, &lower, exclude_port_uuid);
+
+    // Set 4-bit version to 4
+    upper = (upper & (~uint64_t{0xf000})) | 0x4000;
+    // Set unary-encoded variant to 1 (0b10)
+    lower = (lower & (~(uint64_t{3} << 62))) | (uint64_t{2} << 62);
+
+    // Use 36 character format of RFC 4122
+    result.resize(36U);
+    char* buf = &result[0];
+    PutBaseChars<16>(&buf, 8, upper >> 32, /*!uppercase*/ false);
+    *(buf++) = '-';
+    PutBaseChars<16>(&buf, 4, upper >> 16, /*!uppercase*/ false);
+    *(buf++) = '-';
+    PutBaseChars<16>(&buf, 4, upper, /*!uppercase*/ false);
+    *(buf++) = '-';
+    PutBaseChars<16>(&buf, 4, lower >> 48, /*!uppercase*/ false);
+    *(buf++) = '-';
+    PutBaseChars<16>(&buf, 12, lower, /*!uppercase*/ false);
+    assert(buf == &result[36]);
+
+    // Verify variant 1 version 4
+    assert(result[14] == '4');
+    assert(result[19] == '8' || result[19] == '9' || result[19] == 'a' ||
+           result[19] == 'b');
+  }
+  return result;
+}
+
// Out-of-line destructor definitions anchor the vtables of these abstract
// interfaces in this translation unit.
SequentialFile::~SequentialFile() {}

RandomAccessFile::~RandomAccessFile() {}

WritableFile::~WritableFile() {}

MemoryMappedFileBuffer::~MemoryMappedFileBuffer() {}

Logger::~Logger() {}

// Idempotent close: the first call forwards to CloseImpl(); every later call
// is a no-op that reports success.
Status Logger::Close() {
  if (!closed_) {
    closed_ = true;
    return CloseImpl();
  } else {
    return Status::OK();
  }
}

// Default for Logger subclasses that predate explicit Close() support.
Status Logger::CloseImpl() { return Status::NotSupported(); }

FileLock::~FileLock() {}

// Flushes buffered log output; safe to call with a null logger.
void LogFlush(Logger* info_log) {
  if (info_log) {
    info_log->Flush();
  }
}
+
// Emits at INFO level, but only when the logger is non-null and configured to
// accept INFO or lower.
static void Logv(Logger* info_log, const char* format, va_list ap) {
  if (info_log && info_log->GetInfoLogLevel() <= InfoLogLevel::INFO_LEVEL) {
    info_log->Logv(InfoLogLevel::INFO_LEVEL, format, ap);
  }
}

// printf-style INFO-level logging entry point (raw Logger* overload).
void Log(Logger* info_log, const char* format, ...) {
  va_list ap;
  va_start(ap, format);
  Logv(info_log, format, ap);
  va_end(ap);
}
+
// Level-aware dispatch to the format-only Logv()/LogHeader() overloads.
// Messages below the logger's configured level are dropped; WARN and above
// are flushed immediately so they survive an unclean crash.
void Logger::Logv(const InfoLogLevel log_level, const char* format,
                  va_list ap) {
  // Indexed by log_level; only levels below HEADER_LEVEL reach the indexing
  // branch below, so five names suffice.
  static const char* kInfoLogLevelNames[5] = {"DEBUG", "INFO", "WARN", "ERROR",
                                              "FATAL"};
  if (log_level < log_level_) {
    return;
  }

  if (log_level == InfoLogLevel::INFO_LEVEL) {
    // Doesn't print log level if it is INFO level.
    // This is to avoid unexpected performance regression after we add
    // the feature of log level. All the logs before we add the feature
    // are INFO level. We don't want to add extra costs to those existing
    // logging.
    Logv(format, ap);
  } else if (log_level == InfoLogLevel::HEADER_LEVEL) {
    LogHeader(format, ap);
  } else {
    // Prefix the message with its level name; truncation to the 500-byte
    // buffer is acceptable for a log line.
    char new_format[500];
    snprintf(new_format, sizeof(new_format) - 1, "[%s] %s",
             kInfoLogLevelNames[log_level], format);
    Logv(new_format, ap);
  }

  if (log_level >= InfoLogLevel::WARN_LEVEL &&
      log_level != InfoLogLevel::HEADER_LEVEL) {
    // Log messages with severity of warning or higher should be rare and are
    // sometimes followed by an unclean crash. We want to be sure important
    // messages are not lost in an application buffer when that happens.
    Flush();
  }
}
+
// Emits at the requested level when the logger accepts it, routing
// HEADER_LEVEL messages through LogHeader() instead of Logv().
static void Logv(const InfoLogLevel log_level, Logger* info_log,
                 const char* format, va_list ap) {
  if (info_log && info_log->GetInfoLogLevel() <= log_level) {
    if (log_level == InfoLogLevel::HEADER_LEVEL) {
      info_log->LogHeader(format, ap);
    } else {
      info_log->Logv(log_level, format, ap);
    }
  }
}

// printf-style logging at an explicit level (raw Logger* overload).
void Log(const InfoLogLevel log_level, Logger* info_log, const char* format,
         ...) {
  va_list ap;
  va_start(ap, format);
  Logv(log_level, info_log, format, ap);
  va_end(ap);
}

// Writes a header line (unconditionally — headers bypass the level filter).
static void Headerv(Logger* info_log, const char* format, va_list ap) {
  if (info_log) {
    info_log->LogHeader(format, ap);
  }
}

// printf-style header logging entry point (raw Logger* overload).
void Header(Logger* info_log, const char* format, ...) {
  va_list ap;
  va_start(ap, format);
  Headerv(info_log, format, ap);
  va_end(ap);
}
+
// Per-level convenience wrappers. Each static *v() helper filters on the
// logger's configured level, and each public function forwards its varargs.
// The statics are also reused by the shared_ptr<Logger> overloads below.
static void Debugv(Logger* info_log, const char* format, va_list ap) {
  if (info_log && info_log->GetInfoLogLevel() <= InfoLogLevel::DEBUG_LEVEL) {
    info_log->Logv(InfoLogLevel::DEBUG_LEVEL, format, ap);
  }
}

void Debug(Logger* info_log, const char* format, ...) {
  va_list ap;
  va_start(ap, format);
  Debugv(info_log, format, ap);
  va_end(ap);
}

static void Infov(Logger* info_log, const char* format, va_list ap) {
  if (info_log && info_log->GetInfoLogLevel() <= InfoLogLevel::INFO_LEVEL) {
    info_log->Logv(InfoLogLevel::INFO_LEVEL, format, ap);
  }
}

void Info(Logger* info_log, const char* format, ...) {
  va_list ap;
  va_start(ap, format);
  Infov(info_log, format, ap);
  va_end(ap);
}

static void Warnv(Logger* info_log, const char* format, va_list ap) {
  if (info_log && info_log->GetInfoLogLevel() <= InfoLogLevel::WARN_LEVEL) {
    info_log->Logv(InfoLogLevel::WARN_LEVEL, format, ap);
  }
}

void Warn(Logger* info_log, const char* format, ...) {
  va_list ap;
  va_start(ap, format);
  Warnv(info_log, format, ap);
  va_end(ap);
}

static void Errorv(Logger* info_log, const char* format, va_list ap) {
  if (info_log && info_log->GetInfoLogLevel() <= InfoLogLevel::ERROR_LEVEL) {
    info_log->Logv(InfoLogLevel::ERROR_LEVEL, format, ap);
  }
}

void Error(Logger* info_log, const char* format, ...) {
  va_list ap;
  va_start(ap, format);
  Errorv(info_log, format, ap);
  va_end(ap);
}

static void Fatalv(Logger* info_log, const char* format, va_list ap) {
  if (info_log && info_log->GetInfoLogLevel() <= InfoLogLevel::FATAL_LEVEL) {
    info_log->Logv(InfoLogLevel::FATAL_LEVEL, format, ap);
  }
}

void Fatal(Logger* info_log, const char* format, ...) {
  va_list ap;
  va_start(ap, format);
  Fatalv(info_log, format, ap);
  va_end(ap);
}
+
// shared_ptr<Logger> overloads of the logging API. Each simply unwraps the
// pointer with .get() and forwards to the corresponding raw-pointer helper;
// a null shared_ptr is handled by the helpers' null checks.
void LogFlush(const std::shared_ptr<Logger>& info_log) {
  LogFlush(info_log.get());
}

void Log(const InfoLogLevel log_level, const std::shared_ptr<Logger>& info_log,
         const char* format, ...) {
  va_list ap;
  va_start(ap, format);
  Logv(log_level, info_log.get(), format, ap);
  va_end(ap);
}

void Header(const std::shared_ptr<Logger>& info_log, const char* format, ...) {
  va_list ap;
  va_start(ap, format);
  Headerv(info_log.get(), format, ap);
  va_end(ap);
}

void Debug(const std::shared_ptr<Logger>& info_log, const char* format, ...) {
  va_list ap;
  va_start(ap, format);
  Debugv(info_log.get(), format, ap);
  va_end(ap);
}

void Info(const std::shared_ptr<Logger>& info_log, const char* format, ...) {
  va_list ap;
  va_start(ap, format);
  Infov(info_log.get(), format, ap);
  va_end(ap);
}

void Warn(const std::shared_ptr<Logger>& info_log, const char* format, ...) {
  va_list ap;
  va_start(ap, format);
  Warnv(info_log.get(), format, ap);
  va_end(ap);
}

void Error(const std::shared_ptr<Logger>& info_log, const char* format, ...) {
  va_list ap;
  va_start(ap, format);
  Errorv(info_log.get(), format, ap);
  va_end(ap);
}

void Fatal(const std::shared_ptr<Logger>& info_log, const char* format, ...) {
  va_list ap;
  va_start(ap, format);
  Fatalv(info_log.get(), format, ap);
  va_end(ap);
}

// Level-less overload: logs at INFO level.
void Log(const std::shared_ptr<Logger>& info_log, const char* format, ...) {
  va_list ap;
  va_start(ap, format);
  Logv(info_log.get(), format, ap);
  va_end(ap);
}
+
+Status WriteStringToFile(Env* env, const Slice& data, const std::string& fname,
+ bool should_sync) {
+ const auto& fs = env->GetFileSystem();
+ return WriteStringToFile(fs.get(), data, fname, should_sync);
+}
+
+Status ReadFileToString(Env* env, const std::string& fname, std::string* data) {
+ const auto& fs = env->GetFileSystem();
+ return ReadFileToString(fs.get(), fname, data);
+}
+
namespace {  // anonymous namespace

// Copies the I/O-related knobs from DBOptions into an EnvOptions, then gives
// the Env a chance to adjust them via SanitizeEnvOptions(). Called from both
// EnvOptions constructors.
void AssignEnvOptions(EnvOptions* env_options, const DBOptions& options) {
  env_options->use_mmap_reads = options.allow_mmap_reads;
  env_options->use_mmap_writes = options.allow_mmap_writes;
  env_options->use_direct_reads = options.use_direct_reads;
  env_options->set_fd_cloexec = options.is_fd_close_on_exec;
  env_options->bytes_per_sync = options.bytes_per_sync;
  env_options->compaction_readahead_size = options.compaction_readahead_size;
  env_options->random_access_max_buffer_size =
      options.random_access_max_buffer_size;
  env_options->rate_limiter = options.rate_limiter.get();
  env_options->writable_file_max_buffer_size =
      options.writable_file_max_buffer_size;
  env_options->allow_fallocate = options.allow_fallocate;
  env_options->strict_bytes_per_sync = options.strict_bytes_per_sync;
  // Sanitize last so the Env sees the fully-populated options.
  options.env->SanitizeEnvOptions(env_options);
}

}  // namespace
+
+EnvOptions Env::OptimizeForLogWrite(const EnvOptions& env_options,
+ const DBOptions& db_options) const {
+ EnvOptions optimized_env_options(env_options);
+ optimized_env_options.bytes_per_sync = db_options.wal_bytes_per_sync;
+ optimized_env_options.writable_file_max_buffer_size =
+ db_options.writable_file_max_buffer_size;
+ return optimized_env_options;
+}
+
+EnvOptions Env::OptimizeForManifestWrite(const EnvOptions& env_options) const {
+ return env_options;
+}
+
+EnvOptions Env::OptimizeForLogRead(const EnvOptions& env_options) const {
+ EnvOptions optimized_env_options(env_options);
+ optimized_env_options.use_direct_reads = false;
+ return optimized_env_options;
+}
+
+EnvOptions Env::OptimizeForManifestRead(const EnvOptions& env_options) const {
+ EnvOptions optimized_env_options(env_options);
+ optimized_env_options.use_direct_reads = false;
+ return optimized_env_options;
+}
+
+EnvOptions Env::OptimizeForCompactionTableWrite(
+ const EnvOptions& env_options, const ImmutableDBOptions& db_options) const {
+ EnvOptions optimized_env_options(env_options);
+ optimized_env_options.use_direct_writes =
+ db_options.use_direct_io_for_flush_and_compaction;
+ return optimized_env_options;
+}
+
+EnvOptions Env::OptimizeForCompactionTableRead(
+ const EnvOptions& env_options, const ImmutableDBOptions& db_options) const {
+ EnvOptions optimized_env_options(env_options);
+ optimized_env_options.use_direct_reads = db_options.use_direct_reads;
+ return optimized_env_options;
+}
+EnvOptions Env::OptimizeForBlobFileRead(
+ const EnvOptions& env_options, const ImmutableDBOptions& db_options) const {
+ EnvOptions optimized_env_options(env_options);
+ optimized_env_options.use_direct_reads = db_options.use_direct_reads;
+ return optimized_env_options;
+}
+
// Builds EnvOptions from an explicit DBOptions.
EnvOptions::EnvOptions(const DBOptions& options) {
  AssignEnvOptions(this, options);
}

// Builds EnvOptions from default-constructed DBOptions.
EnvOptions::EnvOptions() {
  DBOptions options;
  AssignEnvOptions(this, options);
}
+
// Creates an EnvLogger that writes to `fname` via the Env's file system.
// On success `*result` holds the new logger; on failure the file-creation
// status is returned and `*result` is untouched.
Status NewEnvLogger(const std::string& fname, Env* env,
                    std::shared_ptr<Logger>* result) {
  FileOptions options;
  // TODO: Tune the buffer size.
  options.writable_file_max_buffer_size = 1024 * 1024;
  std::unique_ptr<FSWritableFile> writable_file;
  const auto status = env->GetFileSystem()->NewWritableFile(
      fname, options, &writable_file, nullptr);
  if (!status.ok()) {
    return status;
  }

  *result = std::make_shared<EnvLogger>(std::move(writable_file), fname,
                                        options, env);
  return Status::OK();
}
+
// Accessor for the FileSystem this Env delegates file operations to.
const std::shared_ptr<FileSystem>& Env::GetFileSystem() const {
  return file_system_;
}

// Accessor for the SystemClock this Env delegates time operations to.
const std::shared_ptr<SystemClock>& Env::GetSystemClock() const {
  return system_clock_;
}
namespace {
// Option metadata for SystemClockWrapper's "target" member; the wrapped clock
// is matched by name and not serialized.
static std::unordered_map<std::string, OptionTypeInfo> sc_wrapper_type_info = {
#ifndef ROCKSDB_LITE
    {"target",
     OptionTypeInfo::AsCustomSharedPtr<SystemClock>(
         0, OptionVerificationType::kByName, OptionTypeFlags::kDontSerialize)},
#endif  // ROCKSDB_LITE
};

}  // namespace
// Wraps an existing SystemClock (may be null; resolved in PrepareOptions)
// and registers the target for the Configurable options machinery.
SystemClockWrapper::SystemClockWrapper(const std::shared_ptr<SystemClock>& t)
    : target_(t) {
  RegisterOptions("", &target_, &sc_wrapper_type_info);
}

// Substitutes the default clock when no target was supplied, then defers to
// the base-class preparation.
Status SystemClockWrapper::PrepareOptions(const ConfigOptions& options) {
  if (target_ == nullptr) {
    target_ = SystemClock::Default();
  }
  return SystemClock::PrepareOptions(options);
}
+
#ifndef ROCKSDB_LITE
// Serializes the wrapper's options. For shallow serialization or a
// default-clock target only the base serialization is emitted; otherwise the
// target clock's own serialized form is appended as "target=...".
std::string SystemClockWrapper::SerializeOptions(
    const ConfigOptions& config_options, const std::string& header) const {
  auto parent = SystemClock::SerializeOptions(config_options, "");
  if (config_options.IsShallow() || target_ == nullptr ||
      target_->IsInstanceOf(SystemClock::kDefaultName())) {
    return parent;
  } else {
    std::string result = header;
    // Ensure the serialized form starts with the id property.
    if (!StartsWith(parent, OptionTypeInfo::kIdPropName())) {
      result.append(OptionTypeInfo::kIdPropName()).append("=");
    }
    result.append(parent);
    if (!EndsWith(result, config_options.delimiter)) {
      result.append(config_options.delimiter);
    }
    result.append("target=").append(target_->ToString(config_options));
    return result;
  }
}
#endif  // ROCKSDB_LITE
+
#ifndef ROCKSDB_LITE
// Registers the built-in SystemClock factories (currently only the emulated
// clock) with the object library. Returns the library's factory count.
static int RegisterBuiltinSystemClocks(ObjectLibrary& library,
                                       const std::string& /*arg*/) {
  library.AddFactory<SystemClock>(
      EmulatedSystemClock::kClassName(),
      [](const std::string& /*uri*/, std::unique_ptr<SystemClock>* guard,
         std::string* /* errmsg */) {
        guard->reset(new EmulatedSystemClock(SystemClock::Default()));
        return guard->get();
      });
  size_t num_types;
  return static_cast<int>(library.GetFactoryCount(&num_types));
}
#endif  // ROCKSDB_LITE
+
// Resolves a SystemClock from its string id/URI. The default clock is
// short-circuited; otherwise the builtin factories are registered (exactly
// once, guarded by std::call_once) and the clock is loaded by name.
Status SystemClock::CreateFromString(const ConfigOptions& config_options,
                                     const std::string& value,
                                     std::shared_ptr<SystemClock>* result) {
  auto clock = SystemClock::Default();
  if (clock->IsInstanceOf(value)) {
    *result = clock;
    return Status::OK();
  } else {
#ifndef ROCKSDB_LITE
    static std::once_flag once;
    std::call_once(once, [&]() {
      RegisterBuiltinSystemClocks(*(ObjectLibrary::Default().get()), "");
    });
#endif  // ROCKSDB_LITE
    return LoadSharedObject<SystemClock>(config_options, value, nullptr,
                                         result);
  }
}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/env/env_basic_test.cc b/src/rocksdb/env/env_basic_test.cc
new file mode 100644
index 000000000..0f18b3218
--- /dev/null
+++ b/src/rocksdb/env/env_basic_test.cc
@@ -0,0 +1,401 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "env/mock_env.h"
+#include "file/file_util.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/env.h"
+#include "rocksdb/env_encryption.h"
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
using CreateEnvFunc = Env*();

// These functions are used to create the various environments under which this
// test can execute. These functions are used to allow the test cases to be
// created without the Env being initialized, thereby eliminating a potential
// static initialization fiasco/race condition when attempting to get a
// custom/configured env prior to main being invoked.

static Env* GetDefaultEnv() { return Env::Default(); }

// Singleton mock Env layered over the default Env.
static Env* GetMockEnv() {
  static std::unique_ptr<Env> mock_env(MockEnv::Create(Env::Default()));
  return mock_env.get();
}
#ifndef ROCKSDB_LITE
// Builds an encrypted Env whose EncryptionProvider is resolved from
// `provider_id` (e.g. "CTR://test"). EXPECT_OK records a failure but still
// proceeds; NewEncryptedEnv then receives a null provider in that case.
static Env* NewTestEncryptedEnv(Env* base, const std::string& provider_id) {
  ConfigOptions config_opts;
  config_opts.invoke_prepare_options = false;

  std::shared_ptr<EncryptionProvider> provider;
  EXPECT_OK(EncryptionProvider::CreateFromString(config_opts, provider_id,
                                                 &provider));
  return NewEncryptedEnv(base, provider);
}

// Singleton CTR-encrypted Env over the default Env.
static Env* GetCtrEncryptedEnv() {
  static std::unique_ptr<Env> ctr_encrypt_env(
      NewTestEncryptedEnv(Env::Default(), "CTR://test"));
  return ctr_encrypt_env.get();
}

// Singleton in-memory Env over the default Env.
static Env* GetMemoryEnv() {
  static std::unique_ptr<Env> mem_env(NewMemEnv(Env::Default()));
  return mem_env.get();
}

// Lazily resolves a custom Env from the TEST_ENV_URI environment variable.
// Only registered as a test parameter when that variable is set (see
// GetCustomEnvs()), so the EXPECT_NE acts as a sanity check.
static Env* GetTestEnv() {
  static std::shared_ptr<Env> env_guard;
  static Env* custom_env = nullptr;
  if (custom_env == nullptr) {
    const char* uri = getenv("TEST_ENV_URI");
    if (uri != nullptr) {
      EXPECT_OK(Env::CreateFromUri(ConfigOptions(), uri, "", &custom_env,
                                   &env_guard));
    }
  }
  EXPECT_NE(custom_env, nullptr);
  return custom_env;
}

// Lazily resolves a custom FileSystem-backed Env from TEST_FS_URI, analogous
// to GetTestEnv().
static Env* GetTestFS() {
  static std::shared_ptr<Env> fs_env_guard;
  static Env* fs_env = nullptr;
  if (fs_env == nullptr) {
    const char* uri = getenv("TEST_FS_URI");
    if (uri != nullptr) {
      EXPECT_OK(
          Env::CreateFromUri(ConfigOptions(), uri, "", &fs_env, &fs_env_guard));
    }
  }
  EXPECT_NE(fs_env, nullptr);
  return fs_env;
}
#endif  // ROCKSDB_LITE
+
+} // namespace
// Parameterized fixture: each instantiation supplies a factory returning the
// Env under test. A fresh per-test directory is created in SetUp() and
// destroyed in TearDown().
class EnvBasicTestWithParam
    : public testing::Test,
      public ::testing::WithParamInterface<CreateEnvFunc*> {
 public:
  Env* env_;                    // Env under test (owned by its factory)
  const EnvOptions soptions_;   // default options used by all file opens
  std::string test_dir_;        // per-test scratch directory

  EnvBasicTestWithParam() : env_(GetParam()()) {
    test_dir_ = test::PerThreadDBPath(env_, "env_basic_test");
  }

  void SetUp() override { ASSERT_OK(env_->CreateDirIfMissing(test_dir_)); }

  void TearDown() override { ASSERT_OK(DestroyDir(env_, test_dir_)); }
};
+
// Extended test suite; only instantiated for envs that support the extra
// operations (directories, attributes) exercised by its cases.
class EnvMoreTestWithParam : public EnvBasicTestWithParam {};

INSTANTIATE_TEST_CASE_P(EnvDefault, EnvBasicTestWithParam,
                        ::testing::Values(&GetDefaultEnv));
INSTANTIATE_TEST_CASE_P(EnvDefault, EnvMoreTestWithParam,
                        ::testing::Values(&GetDefaultEnv));

INSTANTIATE_TEST_CASE_P(MockEnv, EnvBasicTestWithParam,
                        ::testing::Values(&GetMockEnv));

#ifndef ROCKSDB_LITE
// next statements run env test against default encryption code.
INSTANTIATE_TEST_CASE_P(EncryptedEnv, EnvBasicTestWithParam,
                        ::testing::Values(&GetCtrEncryptedEnv));
INSTANTIATE_TEST_CASE_P(EncryptedEnv, EnvMoreTestWithParam,
                        ::testing::Values(&GetCtrEncryptedEnv));

INSTANTIATE_TEST_CASE_P(MemEnv, EnvBasicTestWithParam,
                        ::testing::Values(&GetMemoryEnv));

namespace {

// Returns a vector of 0 or 1 Env*, depending whether an Env is registered for
// TEST_ENV_URI.
//
// The purpose of returning an empty vector (instead of nullptr) is that gtest
// ValuesIn() will skip running tests when given an empty collection.
std::vector<CreateEnvFunc*> GetCustomEnvs() {
  std::vector<CreateEnvFunc*> res;
  const char* uri = getenv("TEST_ENV_URI");
  if (uri != nullptr) {
    res.push_back(&GetTestEnv);
  }
  uri = getenv("TEST_FS_URI");
  if (uri != nullptr) {
    res.push_back(&GetTestFS);
  }
  return res;
}

}  // anonymous namespace

INSTANTIATE_TEST_CASE_P(CustomEnv, EnvBasicTestWithParam,
                        ::testing::ValuesIn(GetCustomEnvs()));

INSTANTIATE_TEST_CASE_P(CustomEnv, EnvMoreTestWithParam,
                        ::testing::ValuesIn(GetCustomEnvs()));
#endif  // ROCKSDB_LITE
+
// Smoke test of the core Env file API: existence checks, create/delete,
// size queries, directory listing, and rename (including overwrite).
TEST_P(EnvBasicTestWithParam, Basics) {
  uint64_t file_size;
  std::unique_ptr<WritableFile> writable_file;
  std::vector<std::string> children;

  // Check that the directory is empty.
  ASSERT_EQ(Status::NotFound(), env_->FileExists(test_dir_ + "/non_existent"));
  ASSERT_TRUE(!env_->GetFileSize(test_dir_ + "/non_existent", &file_size).ok());
  ASSERT_OK(env_->GetChildren(test_dir_, &children));
  ASSERT_EQ(0U, children.size());

  // Create a file.
  ASSERT_OK(env_->NewWritableFile(test_dir_ + "/f", &writable_file, soptions_));
  ASSERT_OK(writable_file->Close());
  writable_file.reset();

  // Check that the file exists.
  ASSERT_OK(env_->FileExists(test_dir_ + "/f"));
  ASSERT_OK(env_->GetFileSize(test_dir_ + "/f", &file_size));
  ASSERT_EQ(0U, file_size);
  ASSERT_OK(env_->GetChildren(test_dir_, &children));
  ASSERT_EQ(1U, children.size());
  ASSERT_EQ("f", children[0]);
  ASSERT_OK(env_->DeleteFile(test_dir_ + "/f"));

  // Write to the file.
  ASSERT_OK(
      env_->NewWritableFile(test_dir_ + "/f1", &writable_file, soptions_));
  ASSERT_OK(writable_file->Append("abc"));
  ASSERT_OK(writable_file->Close());
  writable_file.reset();
  ASSERT_OK(
      env_->NewWritableFile(test_dir_ + "/f2", &writable_file, soptions_));
  ASSERT_OK(writable_file->Close());
  writable_file.reset();

  // Check for expected size.
  ASSERT_OK(env_->GetFileSize(test_dir_ + "/f1", &file_size));
  ASSERT_EQ(3U, file_size);

  // Check that renaming works.
  ASSERT_TRUE(
      !env_->RenameFile(test_dir_ + "/non_existent", test_dir_ + "/g").ok());
  ASSERT_OK(env_->RenameFile(test_dir_ + "/f1", test_dir_ + "/g"));
  ASSERT_EQ(Status::NotFound(), env_->FileExists(test_dir_ + "/f1"));
  ASSERT_OK(env_->FileExists(test_dir_ + "/g"));
  ASSERT_OK(env_->GetFileSize(test_dir_ + "/g", &file_size));
  ASSERT_EQ(3U, file_size);

  // Check that renaming overwriting works
  ASSERT_OK(env_->RenameFile(test_dir_ + "/f2", test_dir_ + "/g"));
  ASSERT_OK(env_->GetFileSize(test_dir_ + "/g", &file_size));
  ASSERT_EQ(0U, file_size);

  // Check that opening non-existent file fails.
  std::unique_ptr<SequentialFile> seq_file;
  std::unique_ptr<RandomAccessFile> rand_file;
  ASSERT_TRUE(!env_->NewSequentialFile(test_dir_ + "/non_existent", &seq_file,
                                       soptions_)
                   .ok());
  ASSERT_TRUE(!seq_file);
  ASSERT_NOK(env_->NewRandomAccessFile(test_dir_ + "/non_existent", &rand_file,
                                       soptions_));
  ASSERT_TRUE(!rand_file);

  // Check that deleting works.
  ASSERT_NOK(env_->DeleteFile(test_dir_ + "/non_existent"));
  ASSERT_OK(env_->DeleteFile(test_dir_ + "/g"));
  ASSERT_EQ(Status::NotFound(), env_->FileExists(test_dir_ + "/g"));
  ASSERT_OK(env_->GetChildren(test_dir_, &children));
  ASSERT_EQ(0U, children.size());
  // Listing a non-existent directory must report NotFound.
  Status s = env_->GetChildren(test_dir_ + "/non_existent", &children);
  ASSERT_TRUE(s.IsNotFound());
}
+
// Verifies sequential and random-access reads of data written through the
// Env, including skips, reads past EOF, and out-of-range offsets.
TEST_P(EnvBasicTestWithParam, ReadWrite) {
  std::unique_ptr<WritableFile> writable_file;
  std::unique_ptr<SequentialFile> seq_file;
  std::unique_ptr<RandomAccessFile> rand_file;
  Slice result;
  char scratch[100];

  ASSERT_OK(env_->NewWritableFile(test_dir_ + "/f", &writable_file, soptions_));
  ASSERT_OK(writable_file->Append("hello "));
  ASSERT_OK(writable_file->Append("world"));
  ASSERT_OK(writable_file->Close());
  writable_file.reset();

  // Read sequentially.
  ASSERT_OK(env_->NewSequentialFile(test_dir_ + "/f", &seq_file, soptions_));
  ASSERT_OK(seq_file->Read(5, &result, scratch));  // Read "hello".
  ASSERT_EQ(0, result.compare("hello"));
  ASSERT_OK(seq_file->Skip(1));
  ASSERT_OK(seq_file->Read(1000, &result, scratch));  // Read "world".
  ASSERT_EQ(0, result.compare("world"));
  ASSERT_OK(seq_file->Read(1000, &result, scratch));  // Try reading past EOF.
  ASSERT_EQ(0U, result.size());
  ASSERT_OK(seq_file->Skip(100));  // Try to skip past end of file.
  ASSERT_OK(seq_file->Read(1000, &result, scratch));
  ASSERT_EQ(0U, result.size());

  // Random reads.
  ASSERT_OK(env_->NewRandomAccessFile(test_dir_ + "/f", &rand_file, soptions_));
  ASSERT_OK(rand_file->Read(6, 5, &result, scratch));  // Read "world".
  ASSERT_EQ(0, result.compare("world"));
  ASSERT_OK(rand_file->Read(0, 5, &result, scratch));  // Read "hello".
  ASSERT_EQ(0, result.compare("hello"));
  ASSERT_OK(rand_file->Read(10, 100, &result, scratch));  // Read "d".
  ASSERT_EQ(0, result.compare("d"));

  // Too high offset. Expected to succeed (only the status is checked here);
  // the result content is unspecified for a fully out-of-range read.
  ASSERT_TRUE(rand_file->Read(1000, 5, &result, scratch).ok());
}
+
// Verifies that Sync/Flush/Close on a freshly-created writable file all
// report success.
TEST_P(EnvBasicTestWithParam, Misc) {
  std::unique_ptr<WritableFile> writable_file;
  ASSERT_OK(env_->NewWritableFile(test_dir_ + "/b", &writable_file, soptions_));

  // These are no-ops, but we test they return success.
  ASSERT_OK(writable_file->Sync());
  ASSERT_OK(writable_file->Flush());
  ASSERT_OK(writable_file->Close());
  writable_file.reset();
}
+
+TEST_P(EnvBasicTestWithParam, LargeWrite) {
+ const size_t kWriteSize = 300 * 1024;
+ char* scratch = new char[kWriteSize * 2];
+
+ std::string write_data;
+ for (size_t i = 0; i < kWriteSize; ++i) {
+ write_data.append(1, static_cast<char>(i));
+ }
+
+ std::unique_ptr<WritableFile> writable_file;
+ ASSERT_OK(env_->NewWritableFile(test_dir_ + "/f", &writable_file, soptions_));
+ ASSERT_OK(writable_file->Append("foo"));
+ ASSERT_OK(writable_file->Append(write_data));
+ ASSERT_OK(writable_file->Close());
+ writable_file.reset();
+
+ std::unique_ptr<SequentialFile> seq_file;
+ Slice result;
+ ASSERT_OK(env_->NewSequentialFile(test_dir_ + "/f", &seq_file, soptions_));
+ ASSERT_OK(seq_file->Read(3, &result, scratch)); // Read "foo".
+ ASSERT_EQ(0, result.compare("foo"));
+
+ size_t read = 0;
+ std::string read_data;
+ while (read < kWriteSize) {
+ ASSERT_OK(seq_file->Read(kWriteSize - read, &result, scratch));
+ read_data.append(result.data(), result.size());
+ read += result.size();
+ }
+ ASSERT_TRUE(write_data == read_data);
+ delete[] scratch;
+}
+
// Verifies that a modification time can be fetched for a directory.
TEST_P(EnvMoreTestWithParam, GetModTime) {
  ASSERT_OK(env_->CreateDirIfMissing(test_dir_ + "/dir1"));
  uint64_t mtime1 = 0x0;
  ASSERT_OK(env_->GetFileModificationTime(test_dir_ + "/dir1", &mtime1));
}

// Verifies CreateDir/CreateDirIfMissing/DeleteDir semantics: CreateDir fails
// on an existing path, CreateDirIfMissing does not.
TEST_P(EnvMoreTestWithParam, MakeDir) {
  ASSERT_OK(env_->CreateDir(test_dir_ + "/j"));
  ASSERT_OK(env_->FileExists(test_dir_ + "/j"));
  std::vector<std::string> children;
  ASSERT_OK(env_->GetChildren(test_dir_, &children));
  ASSERT_EQ(1U, children.size());
  // fail because file already exists
  ASSERT_TRUE(!env_->CreateDir(test_dir_ + "/j").ok());
  ASSERT_OK(env_->CreateDirIfMissing(test_dir_ + "/j"));
  ASSERT_OK(env_->DeleteDir(test_dir_ + "/j"));
  ASSERT_EQ(Status::NotFound(), env_->FileExists(test_dir_ + "/j"));
}
+
// Exercises GetChildren/GetChildrenFileAttributes on an empty directory, a
// populated directory, a missing directory, and a plain file.
TEST_P(EnvMoreTestWithParam, GetChildren) {
  // empty folder returns empty vector
  std::vector<std::string> children;
  std::vector<Env::FileAttributes> childAttr;
  ASSERT_OK(env_->CreateDirIfMissing(test_dir_));
  ASSERT_OK(env_->GetChildren(test_dir_, &children));
  ASSERT_OK(env_->FileExists(test_dir_));
  ASSERT_OK(env_->GetChildrenFileAttributes(test_dir_, &childAttr));
  ASSERT_EQ(0U, children.size());
  ASSERT_EQ(0U, childAttr.size());

  // folder with contents returns relative path to test dir
  ASSERT_OK(env_->CreateDirIfMissing(test_dir_ + "/niu"));
  ASSERT_OK(env_->CreateDirIfMissing(test_dir_ + "/you"));
  ASSERT_OK(env_->CreateDirIfMissing(test_dir_ + "/guo"));
  ASSERT_OK(env_->GetChildren(test_dir_, &children));
  ASSERT_OK(env_->GetChildrenFileAttributes(test_dir_, &childAttr));
  ASSERT_EQ(3U, children.size());
  ASSERT_EQ(3U, childAttr.size());
  for (auto each : children) {
    env_->DeleteDir(test_dir_ + "/" + each).PermitUncheckedError();
  }  // necessary for default POSIX env

  // non-exist directory returns IOError
  ASSERT_OK(env_->DeleteDir(test_dir_));
  ASSERT_NOK(env_->FileExists(test_dir_));
  ASSERT_NOK(env_->GetChildren(test_dir_, &children));
  ASSERT_NOK(env_->GetChildrenFileAttributes(test_dir_, &childAttr));

  // if dir is a file, returns IOError
  ASSERT_OK(env_->CreateDir(test_dir_));
  std::unique_ptr<WritableFile> writable_file;
  ASSERT_OK(
      env_->NewWritableFile(test_dir_ + "/file", &writable_file, soptions_));
  ASSERT_OK(writable_file->Close());
  writable_file.reset();
  ASSERT_NOK(env_->GetChildren(test_dir_ + "/file", &children));
  ASSERT_EQ(0U, children.size());
}
+
+TEST_P(EnvMoreTestWithParam, GetChildrenIgnoresDotAndDotDot) {
+ auto* env = Env::Default();
+ ASSERT_OK(env->CreateDirIfMissing(test_dir_));
+
+ // Create a single file
+ std::string path = test_dir_;
+ const EnvOptions soptions;
+#ifdef OS_WIN
+ path.append("\\test_file");
+#else
+ path.append("/test_file");
+#endif
+ std::string data("test data");
+ std::unique_ptr<WritableFile> file;
+ ASSERT_OK(env->NewWritableFile(path, &file, soptions));
+ ASSERT_OK(file->Append("test data"));
+
+ // get the children
+ std::vector<std::string> result;
+ ASSERT_OK(env->GetChildren(test_dir_, &result));
+
+ // expect only one file named `test_data`, i.e. no `.` or `..` names
+ ASSERT_EQ(result.size(), 1);
+ ASSERT_EQ(result.at(0), "test_file");
+}
+
+} // namespace ROCKSDB_NAMESPACE
// Test entry point: installs the stack-trace handler (for crash diagnostics)
// before running all registered gtest cases.
int main(int argc, char** argv) {
  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}
diff --git a/src/rocksdb/env/env_chroot.cc b/src/rocksdb/env/env_chroot.cc
new file mode 100644
index 000000000..a64373517
--- /dev/null
+++ b/src/rocksdb/env/env_chroot.cc
@@ -0,0 +1,148 @@
+// Copyright (c) 2016-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#if !defined(ROCKSDB_LITE) && !defined(OS_WIN)
+
+#include "env/env_chroot.h"
+
+#include <errno.h> // errno
+#include <stdlib.h> // realpath, free
+#include <unistd.h> // geteuid
+
+#include "env/composite_env_wrapper.h"
+#include "env/fs_remap.h"
+#include "rocksdb/utilities/options_type.h"
+#include "util/string_util.h" // errnoStr
+
+namespace ROCKSDB_NAMESPACE {
namespace {
// Option metadata so chroot_dir can be set/serialized via the Configurable
// machinery.
static std::unordered_map<std::string, OptionTypeInfo> chroot_fs_type_info = {
    {"chroot_dir", {0, OptionType::kString}}};
}  // namespace
// Remaps all paths under `chroot_dir`; the directory is validated and
// canonicalized later in PrepareOptions().
ChrootFileSystem::ChrootFileSystem(const std::shared_ptr<FileSystem>& base,
                                   const std::string& chroot_dir)
    : RemapFileSystem(base), chroot_dir_(chroot_dir) {
  RegisterOptions("chroot_dir", &chroot_dir_, &chroot_fs_type_info);
}
+
// Validates chroot_dir_ (non-empty and existing on the target FS) and
// canonicalizes it with realpath() so later prefix comparisons in
// EncodePath() operate on a normalized absolute path.
Status ChrootFileSystem::PrepareOptions(const ConfigOptions& options) {
  Status s = FileSystemWrapper::PrepareOptions(options);
  if (!s.ok()) {
    return s;
  } else if (chroot_dir_.empty()) {
    s = Status::InvalidArgument("ChRootFileSystem requires a chroot dir");
  } else {
    s = target_->FileExists(chroot_dir_, IOOptions(), nullptr);
  }
  if (s.ok()) {
#if defined(OS_AIX)
    // AIX realpath() writes into a caller-provided stack buffer, so no
    // free() is needed on that platform.
    char resolvedName[PATH_MAX];
    char* real_chroot_dir = realpath(chroot_dir_.c_str(), resolvedName);
#else
    char* real_chroot_dir = realpath(chroot_dir_.c_str(), nullptr);
#endif
    // chroot_dir must exist so realpath() returns non-nullptr.
    assert(real_chroot_dir != nullptr);
    chroot_dir_ = real_chroot_dir;
#if !defined(OS_AIX)
    free(real_chroot_dir);
#endif
  }
  return s;
}
+
// Returns a per-user test directory path (inside the chroot) and creates it
// if missing.
IOStatus ChrootFileSystem::GetTestDirectory(const IOOptions& options,
                                            std::string* path,
                                            IODebugContext* dbg) {
  // Adapted from PosixEnv's implementation since it doesn't provide a way to
  // create directory in the chroot.
  char buf[256];
  snprintf(buf, sizeof(buf), "/rocksdbtest-%d", static_cast<int>(geteuid()));
  *path = buf;

  // Directory may already exist, so ignore return
  return CreateDirIfMissing(*path, options, dbg);
}
+
// Returns status and expanded absolute path including the chroot directory.
// Checks whether the provided path breaks out of the chroot. If it returns
// non-OK status, the returned path should not be used.
std::pair<IOStatus, std::string> ChrootFileSystem::EncodePath(
    const std::string& path) {
  if (path.empty() || path[0] != '/') {
    return {IOStatus::InvalidArgument(path, "Not an absolute path"), ""};
  }
  std::pair<IOStatus, std::string> res;
  res.second = chroot_dir_ + path;
#if defined(OS_AIX)
  char resolvedName[PATH_MAX];
  char* normalized_path = realpath(res.second.c_str(), resolvedName);
#else
  char* normalized_path = realpath(res.second.c_str(), nullptr);
#endif
  if (normalized_path == nullptr) {
    res.first = IOStatus::NotFound(res.second, errnoStr(errno).c_str());
  } else if (strlen(normalized_path) < chroot_dir_.size() ||
             strncmp(normalized_path, chroot_dir_.c_str(),
                     chroot_dir_.size()) != 0) {
    // realpath() resolved ".." / symlinks; if the canonical result no longer
    // begins with the chroot prefix, the path escapes the jail.
    res.first = IOStatus::IOError(res.second,
                                  "Attempted to access path outside chroot");
  } else {
    res.first = IOStatus::OK();
  }
#if !defined(OS_AIX)
  // free(nullptr) is a no-op, so this is safe on the NotFound branch.
  free(normalized_path);
#endif
  return res;
}
+
// Similar to EncodePath() except assumes the basename in the path hasn't been
// created yet.
std::pair<IOStatus, std::string> ChrootFileSystem::EncodePathWithNewBasename(
    const std::string& path) {
  if (path.empty() || path[0] != '/') {
    return {IOStatus::InvalidArgument(path, "Not an absolute path"), ""};
  }
  // Basename may be followed by trailing slashes
  size_t final_idx = path.find_last_not_of('/');
  if (final_idx == std::string::npos) {
    // It's only slashes so no basename to extract
    return EncodePath(path);
  }

  // Pull off the basename temporarily since realname(3) (used by
  // EncodePath()) requires a path that exists
  size_t base_sep = path.rfind('/', final_idx);
  // Encode only the (existing) parent directory, then re-append the
  // not-yet-created basename to the encoded result.
  auto status_and_enc_path = EncodePath(path.substr(0, base_sep + 1));
  status_and_enc_path.second.append(path.substr(base_sep + 1));
  return status_and_enc_path;
}
+
+std::shared_ptr<FileSystem> NewChrootFileSystem(
+ const std::shared_ptr<FileSystem>& base, const std::string& chroot_dir) {
+ auto chroot_fs = std::make_shared<ChrootFileSystem>(base, chroot_dir);
+ Status s = chroot_fs->PrepareOptions(ConfigOptions());
+ if (s.ok()) {
+ return chroot_fs;
+ } else {
+ return nullptr;
+ }
+}
+
+Env* NewChrootEnv(Env* base_env, const std::string& chroot_dir) {
+ if (!base_env->FileExists(chroot_dir).ok()) {
+ return nullptr;
+ }
+ auto chroot_fs = NewChrootFileSystem(base_env->GetFileSystem(), chroot_dir);
+ if (chroot_fs != nullptr) {
+ return new CompositeEnvWrapper(base_env, chroot_fs);
+ } else {
+ return nullptr;
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !defined(ROCKSDB_LITE) && !defined(OS_WIN)
diff --git a/src/rocksdb/env/env_chroot.h b/src/rocksdb/env/env_chroot.h
new file mode 100644
index 000000000..9e5b9a1e9
--- /dev/null
+++ b/src/rocksdb/env/env_chroot.h
@@ -0,0 +1,55 @@
+// Copyright (c) 2016-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#if !defined(ROCKSDB_LITE) && !defined(OS_WIN)
+
+#include <string>
+
+#include "env/fs_remap.h"
+#include "rocksdb/file_system.h"
+
+namespace ROCKSDB_NAMESPACE {
+class ChrootFileSystem : public RemapFileSystem {  // remaps all paths under a chroot dir
+ public:
+  ChrootFileSystem(const std::shared_ptr<FileSystem>& base,
+                   const std::string& chroot_dir);
+
+  static const char* kClassName() { return "ChrootFS"; }
+  const char* Name() const override { return kClassName(); }
+
+  IOStatus GetTestDirectory(const IOOptions& options, std::string* path,
+                            IODebugContext* dbg) override;
+
+  Status PrepareOptions(const ConfigOptions& options) override;
+
+ protected:
+  // Returns status and expanded absolute path including the chroot directory.
+  // Checks whether the provided path breaks out of the chroot. If it returns
+  // non-OK status, the returned path should not be used.
+  std::pair<IOStatus, std::string> EncodePath(const std::string& path) override;
+
+  // Similar to EncodePath() except assumes the basename in the path hasn't been
+  // created yet (only the parent directory must exist).
+  std::pair<IOStatus, std::string> EncodePathWithNewBasename(
+      const std::string& path) override;
+
+ private:
+  std::string chroot_dir_;  // directory all remapped paths live under
+};
+
+// Returns an Env that translates paths such that the root directory appears to
+// be chroot_dir. chroot_dir should refer to an existing directory.
+//
+// This class has not been fully analyzed for providing strong security
+// guarantees.
+Env* NewChrootEnv(Env* base_env, const std::string& chroot_dir);
+std::shared_ptr<FileSystem> NewChrootFileSystem(
+ const std::shared_ptr<FileSystem>& base, const std::string& chroot_dir);
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !defined(ROCKSDB_LITE) && !defined(OS_WIN)
diff --git a/src/rocksdb/env/env_encryption.cc b/src/rocksdb/env/env_encryption.cc
new file mode 100644
index 000000000..c6b0a257d
--- /dev/null
+++ b/src/rocksdb/env/env_encryption.cc
@@ -0,0 +1,1351 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/env_encryption.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cctype>
+#include <iostream>
+
+#include "env/composite_env_wrapper.h"
+#include "env/env_encryption_ctr.h"
+#include "monitoring/perf_context_imp.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/io_status.h"
+#include "rocksdb/system_clock.h"
+#include "rocksdb/utilities/customizable_util.h"
+#include "rocksdb/utilities/options_type.h"
+#include "util/aligned_buffer.h"
+#include "util/coding.h"
+#include "util/random.h"
+#include "util/string_util.h"
+
+#endif
+namespace ROCKSDB_NAMESPACE {
+#ifndef ROCKSDB_LITE
+std::shared_ptr<EncryptionProvider> EncryptionProvider::NewCTRProvider(
+ const std::shared_ptr<BlockCipher>& cipher) {
+ return std::make_shared<CTREncryptionProvider>(cipher);
+}
+
+// Read up to "n" bytes from the file. "scratch[0..n-1]" may be
+// written by this routine. Sets "*result" to the data that was
+// read (including if fewer than "n" bytes were successfully read).
+// May set "*result" to point at data in "scratch[0..n-1]", so
+// "scratch[0..n-1]" must be live when "*result" is used.
+// If an error was encountered, returns a non-OK status.
+//
+// REQUIRES: External synchronization
+IOStatus EncryptedSequentialFile::Read(size_t n, const IOOptions& options,
+                                       Slice* result, char* scratch,
+                                       IODebugContext* dbg) {
+  assert(scratch);
+  IOStatus io_s = file_->Read(n, options, result, scratch, dbg);
+  if (!io_s.ok()) {
+    return io_s;
+  }
+  {
+    PERF_TIMER_GUARD(decrypt_data_nanos);
+    io_s = status_to_io_status(
+        stream_->Decrypt(offset_, (char*)result->data(), result->size()));
+  }
+  if (io_s.ok()) {
+    offset_ += result->size();  // We've already read data from disk.
+                                // NOTE(review): guard skips this on decrypt failure,
+  }                             // contradicting the old comment — confirm intent.
+  return io_s;
+}
+
+// Skip "n" bytes from the file. This is guaranteed to be no
+// slower that reading the same data, but may be faster.
+//
+// If end of file is reached, skipping will stop at the end of the
+// file, and Skip will return OK.
+//
+// REQUIRES: External synchronization
+IOStatus EncryptedSequentialFile::Skip(uint64_t n) {
+  auto status = file_->Skip(n);
+  if (!status.ok()) {
+    return status;
+  }
+  offset_ += n;  // keep logical position in sync so later Decrypt() calls line up
+  return status;
+}
+
+// Indicates the upper layers if the current SequentialFile implementation
+// uses direct IO.
+bool EncryptedSequentialFile::use_direct_io() const {
+  return file_->use_direct_io();  // pure delegation to the wrapped file
+}
+
+// Use the returned alignment value to allocate
+// aligned buffer for Direct I/O
+size_t EncryptedSequentialFile::GetRequiredBufferAlignment() const {
+  return file_->GetRequiredBufferAlignment();
+}
+
+// Remove any kind of caching of data from the offset to offset+length
+// of this file. If the length is 0, then it refers to the end of file.
+// If the system is not caching the file contents, then this is a noop.
+IOStatus EncryptedSequentialFile::InvalidateCache(size_t offset,
+                                                  size_t length) {
+  return file_->InvalidateCache(offset + prefixLength_, length);  // shift past prefix
+}
+
+// Positioned Read for direct I/O
+// If Direct I/O enabled, offset, n, and scratch should be properly aligned
+IOStatus EncryptedSequentialFile::PositionedRead(uint64_t offset, size_t n,
+                                                 const IOOptions& options,
+                                                 Slice* result, char* scratch,
+                                                 IODebugContext* dbg) {
+  assert(scratch);
+  offset += prefixLength_;  // Skip prefix: translate logical to raw file offset
+  auto io_s = file_->PositionedRead(offset, n, options, result, scratch, dbg);
+  if (!io_s.ok()) {
+    return io_s;
+  }
+  offset_ = offset + result->size();  // NOTE(review): includes prefixLength_, unlike Read() — confirm
+  {
+    PERF_TIMER_GUARD(decrypt_data_nanos);
+    io_s = status_to_io_status(
+        stream_->Decrypt(offset, (char*)result->data(), result->size()));
+  }
+  return io_s;
+}
+
+// Read up to "n" bytes from the file starting at "offset".
+// "scratch[0..n-1]" may be written by this routine. Sets "*result"
+// to the data that was read (including if fewer than "n" bytes were
+// successfully read). May set "*result" to point at data in
+// "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
+// "*result" is used. If an error was encountered, returns a non-OK
+// status.
+//
+// Safe for concurrent use by multiple threads.
+// If Direct I/O enabled, offset, n, and scratch should be aligned properly.
+IOStatus EncryptedRandomAccessFile::Read(uint64_t offset, size_t n,
+                                         const IOOptions& options,
+                                         Slice* result, char* scratch,
+                                         IODebugContext* dbg) const {
+  assert(scratch);
+  offset += prefixLength_;  // translate logical offset to raw file offset
+  auto io_s = file_->Read(offset, n, options, result, scratch, dbg);
+  if (!io_s.ok()) {
+    return io_s;
+  }
+  {
+    PERF_TIMER_GUARD(decrypt_data_nanos);
+    io_s = status_to_io_status(
+        stream_->Decrypt(offset, (char*)result->data(), result->size()));
+  }
+  return io_s;
+}
+
+// Readahead the file starting from offset by n bytes for caching.
+IOStatus EncryptedRandomAccessFile::Prefetch(uint64_t offset, size_t n,
+                                             const IOOptions& options,
+                                             IODebugContext* dbg) {
+  // Forward to the underlying file, shifted past the encryption prefix.
+  return file_->Prefetch(offset + prefixLength_, n, options, dbg);
+}
+
+// Tries to get an unique ID for this file that will be the same each time
+// the file is opened (and will stay the same while the file is open).
+// Furthermore, it tries to make this ID at most "max_size" bytes. If such an
+// ID can be created this function returns the length of the ID and places it
+// in "id"; otherwise, this function returns 0, in which case "id"
+// may not have been modified.
+//
+// This function guarantees, for IDs from a given environment, two unique ids
+// cannot be made equal to each other by adding arbitrary bytes to one of
+// them. That is, no unique ID is the prefix of another.
+//
+// This function guarantees that the returned ID will not be interpretable as
+// a single varint.
+//
+// Note: these IDs are only valid for the duration of the process.
+size_t EncryptedRandomAccessFile::GetUniqueId(char* id, size_t max_size) const {
+  return file_->GetUniqueId(id, max_size);
+};  // NOTE(review): stray ';' after body — harmless empty declaration
+
+void EncryptedRandomAccessFile::Hint(AccessPattern pattern) {
+  file_->Hint(pattern);  // pass access-pattern hint straight through
+}
+
+// Indicates the upper layers if the current RandomAccessFile implementation
+// uses direct IO.
+bool EncryptedRandomAccessFile::use_direct_io() const {
+  return file_->use_direct_io();
+}
+
+// Use the returned alignment value to allocate
+// aligned buffer for Direct I/O
+size_t EncryptedRandomAccessFile::GetRequiredBufferAlignment() const {
+  return file_->GetRequiredBufferAlignment();
+}
+
+// Remove any kind of caching of data from the offset to offset+length
+// of this file. If the length is 0, then it refers to the end of file.
+// If the system is not caching the file contents, then this is a noop.
+IOStatus EncryptedRandomAccessFile::InvalidateCache(size_t offset,
+                                                    size_t length) {
+  return file_->InvalidateCache(offset + prefixLength_, length);  // skip prefix
+}
+
+// A file abstraction for sequential writing. The implementation
+// must provide buffering since callers may append small fragments
+// at a time to the file.
+IOStatus EncryptedWritableFile::Append(const Slice& data,
+                                       const IOOptions& options,
+                                       IODebugContext* dbg) {
+  AlignedBuffer buf;
+  Slice dataToAppend(data);
+  if (data.size() > 0) {
+    auto offset = file_->GetFileSize(options, dbg); // size including prefix
+    // Encrypt in cloned buffer (do not mutate the caller's data)
+    buf.Alignment(GetRequiredBufferAlignment());
+    buf.AllocateNewBuffer(data.size());
+    // TODO (sagar0): Modify AlignedBuffer.Append to allow doing a memmove
+    // so that the next two lines can be replaced with buf.Append().
+    memmove(buf.BufferStart(), data.data(), data.size());
+    buf.Size(data.size());
+    IOStatus io_s;
+    {
+      PERF_TIMER_GUARD(encrypt_data_nanos);
+      io_s = status_to_io_status(
+          stream_->Encrypt(offset, buf.BufferStart(), buf.CurrentSize()));
+    }
+    if (!io_s.ok()) {
+      return io_s;
+    }
+    dataToAppend = Slice(buf.BufferStart(), buf.CurrentSize());
+  }
+  return file_->Append(dataToAppend, options, dbg);  // empty writes pass through
+}
+
+IOStatus EncryptedWritableFile::PositionedAppend(const Slice& data,
+                                                 uint64_t offset,
+                                                 const IOOptions& options,
+                                                 IODebugContext* dbg) {
+  AlignedBuffer buf;
+  Slice dataToAppend(data);
+  offset += prefixLength_;  // raw file offset, past the encryption prefix
+  if (data.size() > 0) {
+    // Encrypt in cloned buffer (do not mutate the caller's data)
+    buf.Alignment(GetRequiredBufferAlignment());
+    buf.AllocateNewBuffer(data.size());
+    memmove(buf.BufferStart(), data.data(), data.size());
+    buf.Size(data.size());
+    IOStatus io_s;
+    {
+      PERF_TIMER_GUARD(encrypt_data_nanos);
+      io_s = status_to_io_status(
+          stream_->Encrypt(offset, buf.BufferStart(), buf.CurrentSize()));
+    }
+    if (!io_s.ok()) {
+      return io_s;
+    }
+    dataToAppend = Slice(buf.BufferStart(), buf.CurrentSize());
+  }
+  return file_->PositionedAppend(dataToAppend, offset, options, dbg);
+}
+
+// Indicates the upper layers if the current WritableFile implementation
+// uses direct IO.
+bool EncryptedWritableFile::use_direct_io() const {
+  return file_->use_direct_io();  // pure delegation
+}
+
+// true if Sync() and Fsync() are safe to call concurrently with Append()
+// and Flush().
+bool EncryptedWritableFile::IsSyncThreadSafe() const {
+  return file_->IsSyncThreadSafe();
+}
+
+// Use the returned alignment value to allocate
+// aligned buffer for Direct I/O
+size_t EncryptedWritableFile::GetRequiredBufferAlignment() const {
+  return file_->GetRequiredBufferAlignment();
+}
+
+/*
+ * Get the size of valid data in the file (logical size, prefix excluded).
+ */
+uint64_t EncryptedWritableFile::GetFileSize(const IOOptions& options,
+                                            IODebugContext* dbg) {
+  return file_->GetFileSize(options, dbg) - prefixLength_;
+}
+
+// Truncate is necessary to trim the file to the correct size
+// before closing. It is not always possible to keep track of the file
+// size due to whole pages writes. The behavior is undefined if called
+// with other writes to follow.
+IOStatus EncryptedWritableFile::Truncate(uint64_t size,
+                                         const IOOptions& options,
+                                         IODebugContext* dbg) {
+  return file_->Truncate(size + prefixLength_, options, dbg);  // logical -> raw size
+}
+
+// Remove any kind of caching of data from the offset to offset+length
+// of this file. If the length is 0, then it refers to the end of file.
+// If the system is not caching the file contents, then this is a noop.
+// This call has no effect on dirty pages in the cache.
+IOStatus EncryptedWritableFile::InvalidateCache(size_t offset, size_t length) {
+  return file_->InvalidateCache(offset + prefixLength_, length);  // skip prefix
+}
+
+// Sync a file range with disk.
+// offset is the starting byte of the file range to be synchronized.
+// nbytes specifies the length of the range to be synchronized.
+// This asks the OS to initiate flushing the cached data to disk,
+// without waiting for completion.
+// Default implementation does nothing.
+IOStatus EncryptedWritableFile::RangeSync(uint64_t offset, uint64_t nbytes,
+                                          const IOOptions& options,
+                                          IODebugContext* dbg) {
+  return file_->RangeSync(offset + prefixLength_, nbytes, options, dbg);  // skip prefix
+}
+
+// PrepareWrite performs any necessary preparation for a write
+// before the write actually occurs. This allows for pre-allocation
+// of space on devices where it can result in less file
+// fragmentation and/or less waste from over-zealous filesystem
+// pre-allocation.
+void EncryptedWritableFile::PrepareWrite(size_t offset, size_t len,
+                                         const IOOptions& options,
+                                         IODebugContext* dbg) {
+  file_->PrepareWrite(offset + prefixLength_, len, options, dbg);  // raw offset
+}
+
+void EncryptedWritableFile::SetPreallocationBlockSize(size_t size) {
+  // The size here doesn't need to include prefixLength_, as it is a
+  // configuration value that will be used by PrepareWrite().
+  file_->SetPreallocationBlockSize(size);
+}
+
+void EncryptedWritableFile::GetPreallocationStatus(
+    size_t* block_size, size_t* last_allocated_block) {
+  file_->GetPreallocationStatus(block_size, last_allocated_block);
+}
+
+// Pre-allocates space for a file.
+IOStatus EncryptedWritableFile::Allocate(uint64_t offset, uint64_t len,
+                                         const IOOptions& options,
+                                         IODebugContext* dbg) {
+  return file_->Allocate(offset + prefixLength_, len, options, dbg);  // raw offset
+}
+
+IOStatus EncryptedWritableFile::Flush(const IOOptions& options,
+                                      IODebugContext* dbg) {
+  return file_->Flush(options, dbg);  // no offset translation needed
+}
+
+IOStatus EncryptedWritableFile::Sync(const IOOptions& options,
+                                     IODebugContext* dbg) {
+  return file_->Sync(options, dbg);
+}
+
+IOStatus EncryptedWritableFile::Close(const IOOptions& options,
+                                      IODebugContext* dbg) {
+  return file_->Close(options, dbg);
+}
+
+// A file abstraction for random reading and writing.
+
+// Indicates if the class makes use of direct I/O
+// If false you must pass aligned buffer to Write()
+bool EncryptedRandomRWFile::use_direct_io() const {
+  return file_->use_direct_io();  // pure delegation
+}
+
+// Use the returned alignment value to allocate
+// aligned buffer for Direct I/O
+size_t EncryptedRandomRWFile::GetRequiredBufferAlignment() const {
+  return file_->GetRequiredBufferAlignment();
+}
+
+// Write bytes in `data` at offset `offset`, Returns Status::OK() on success.
+// Pass aligned buffer when use_direct_io() returns true.
+IOStatus EncryptedRandomRWFile::Write(uint64_t offset, const Slice& data,
+                                      const IOOptions& options,
+                                      IODebugContext* dbg) {
+  AlignedBuffer buf;
+  Slice dataToWrite(data);
+  offset += prefixLength_;  // raw file offset, past the encryption prefix
+  if (data.size() > 0) {
+    // Encrypt in cloned buffer (do not mutate the caller's data)
+    buf.Alignment(GetRequiredBufferAlignment());
+    buf.AllocateNewBuffer(data.size());
+    memmove(buf.BufferStart(), data.data(), data.size());
+    buf.Size(data.size());
+    IOStatus io_s;
+    {
+      PERF_TIMER_GUARD(encrypt_data_nanos);
+      io_s = status_to_io_status(
+          stream_->Encrypt(offset, buf.BufferStart(), buf.CurrentSize()));
+    }
+    if (!io_s.ok()) {
+      return io_s;
+    }
+    dataToWrite = Slice(buf.BufferStart(), buf.CurrentSize());
+  }
+  return file_->Write(offset, dataToWrite, options, dbg);
+}
+
+// Read up to `n` bytes starting from offset `offset` and store them in
+// result, provided `scratch` size should be at least `n`.
+// Returns Status::OK() on success.
+IOStatus EncryptedRandomRWFile::Read(uint64_t offset, size_t n,
+                                     const IOOptions& options, Slice* result,
+                                     char* scratch, IODebugContext* dbg) const {
+  assert(scratch);
+  offset += prefixLength_;  // translate logical offset to raw file offset
+  auto status = file_->Read(offset, n, options, result, scratch, dbg);
+  if (!status.ok()) {
+    return status;
+  }
+  {
+    PERF_TIMER_GUARD(decrypt_data_nanos);
+    status = status_to_io_status(
+        stream_->Decrypt(offset, (char*)result->data(), result->size()));
+  }
+  return status;
+}
+
+IOStatus EncryptedRandomRWFile::Flush(const IOOptions& options,
+                                      IODebugContext* dbg) {
+  return file_->Flush(options, dbg);  // no offset translation needed
+}
+
+IOStatus EncryptedRandomRWFile::Sync(const IOOptions& options,
+                                     IODebugContext* dbg) {
+  return file_->Sync(options, dbg);
+}
+
+IOStatus EncryptedRandomRWFile::Fsync(const IOOptions& options,
+                                      IODebugContext* dbg) {
+  return file_->Fsync(options, dbg);
+}
+
+IOStatus EncryptedRandomRWFile::Close(const IOOptions& options,
+                                      IODebugContext* dbg) {
+  return file_->Close(options, dbg);
+}
+
+namespace {
+static std::unordered_map<std::string, OptionTypeInfo> encrypted_fs_type_info =
+    {
+        {"provider",
+         OptionTypeInfo::AsCustomSharedPtr<EncryptionProvider>(
+             0 /* No offset, whole struct*/, OptionVerificationType::kByName,
+             OptionTypeFlags::kNone)},
+};  // binds the "provider" option to the FS's EncryptionProvider shared_ptr
+// EncryptedFileSystemImpl implements an FileSystemWrapper that adds encryption
+// to files stored on disk.
+class EncryptedFileSystemImpl : public EncryptedFileSystem {
+ public:
+  const char* Name() const override {
+    return EncryptedFileSystem::kClassName();
+  }
+  // Returns the raw encryption provider that should be used to write the input
+  // encrypted file. If there is no such provider, NotFound is returned.
+  IOStatus GetWritableProvider(const std::string& /*fname*/,
+                               EncryptionProvider** result) {
+    if (provider_) {
+      *result = provider_.get();
+      return IOStatus::OK();
+    } else {
+      *result = nullptr;
+      return IOStatus::NotFound("No WriteProvider specified");
+    }
+  }
+
+  // Returns the raw encryption provider that should be used to read the input
+  // encrypted file. If there is no such provider, NotFound is returned.
+  // (Currently identical to GetWritableProvider; both use provider_.)
+  IOStatus GetReadableProvider(const std::string& /*fname*/,
+                               EncryptionProvider** result) {
+    if (provider_) {
+      *result = provider_.get();
+      return IOStatus::OK();
+    } else {
+      *result = nullptr;
+      return IOStatus::NotFound("No Provider specified");
+    }
+  }
+
+ // Creates a CipherStream for the underlying file/name using the options
+ // If a writable provider is found and encryption is enabled, uses
+ // this provider to create a cipher stream.
+ // @param fname Name of the writable file
+ // @param underlying The underlying "raw" file
+ // @param options Options for creating the file/cipher
+ // @param prefix_length Returns the length of the encryption prefix used for
+ // this file
+ // @param stream Returns the cipher stream to use for this file if it
+ // should be encrypted
+ // @return OK on success, non-OK on failure.
+  template <class TypeFile>
+  IOStatus CreateWritableCipherStream(
+      const std::string& fname, const std::unique_ptr<TypeFile>& underlying,
+      const FileOptions& options, size_t* prefix_length,
+      std::unique_ptr<BlockAccessCipherStream>* stream, IODebugContext* dbg) {
+    EncryptionProvider* provider = nullptr;
+    *prefix_length = 0;
+    IOStatus status = GetWritableProvider(fname, &provider);
+    if (!status.ok()) {
+      return status;
+    } else if (provider != nullptr) {  // always true when status is OK
+      // Initialize & write prefix (if needed)
+      AlignedBuffer buffer;
+      Slice prefix;
+      *prefix_length = provider->GetPrefixLength();
+      if (*prefix_length > 0) {
+        // Initialize prefix
+        buffer.Alignment(underlying->GetRequiredBufferAlignment());
+        buffer.AllocateNewBuffer(*prefix_length);
+        status = status_to_io_status(provider->CreateNewPrefix(
+            fname, buffer.BufferStart(), *prefix_length));
+        if (status.ok()) {
+          buffer.Size(*prefix_length);
+          prefix = Slice(buffer.BufferStart(), buffer.CurrentSize());
+          // Write prefix at the start of the file
+          status = underlying->Append(prefix, options.io_options, dbg);
+        }
+        if (!status.ok()) {
+          return status;
+        }
+      }
+      // Create cipher stream (prefix may be empty if *prefix_length == 0)
+      status = status_to_io_status(
+          provider->CreateCipherStream(fname, options, prefix, stream));
+    }
+    return status;
+  }
+
+  template <class TypeFile>
+  IOStatus CreateWritableEncryptedFile(const std::string& fname,
+                                       std::unique_ptr<TypeFile>& underlying,
+                                       const FileOptions& options,
+                                       std::unique_ptr<TypeFile>* result,
+                                       IODebugContext* dbg) {
+    // Create cipher stream, then wrap the raw file if encryption applies
+    std::unique_ptr<BlockAccessCipherStream> stream;
+    size_t prefix_length;
+    IOStatus status = CreateWritableCipherStream(fname, underlying, options,
+                                                 &prefix_length, &stream, dbg);
+    if (status.ok()) {
+      if (stream) {
+        result->reset(new EncryptedWritableFile(
+            std::move(underlying), std::move(stream), prefix_length));
+      } else {
+        result->reset(underlying.release());  // no provider: pass through raw
+      }
+    }
+    return status;
+  }
+
+ // Creates a CipherStream for the underlying file/name using the options
+ // If a writable provider is found and encryption is enabled, uses
+ // this provider to create a cipher stream.
+ // @param fname Name of the writable file
+ // @param underlying The underlying "raw" file
+ // @param options Options for creating the file/cipher
+ // @param prefix_length Returns the length of the encryption prefix used for
+ // this file
+ // @param stream Returns the cipher stream to use for this file if it
+ // should be encrypted
+ // @return OK on success, non-OK on failure.
+  template <class TypeFile>
+  IOStatus CreateRandomWriteCipherStream(
+      const std::string& fname, const std::unique_ptr<TypeFile>& underlying,
+      const FileOptions& options, size_t* prefix_length,
+      std::unique_ptr<BlockAccessCipherStream>* stream, IODebugContext* dbg) {
+    EncryptionProvider* provider = nullptr;
+    *prefix_length = 0;
+    IOStatus io_s = GetWritableProvider(fname, &provider);
+    if (!io_s.ok()) {
+      return io_s;
+    } else if (provider != nullptr) {  // always true when io_s is OK
+      // Initialize & write prefix (if needed)
+      AlignedBuffer buffer;
+      Slice prefix;
+      *prefix_length = provider->GetPrefixLength();
+      if (*prefix_length > 0) {
+        // Initialize prefix
+        buffer.Alignment(underlying->GetRequiredBufferAlignment());
+        buffer.AllocateNewBuffer(*prefix_length);
+        io_s = status_to_io_status(provider->CreateNewPrefix(
+            fname, buffer.BufferStart(), *prefix_length));
+        if (io_s.ok()) {
+          buffer.Size(*prefix_length);
+          prefix = Slice(buffer.BufferStart(), buffer.CurrentSize());
+          // Write prefix at offset 0 (random-write file, so not Append)
+          io_s = underlying->Write(0, prefix, options.io_options, dbg);
+        }
+        if (!io_s.ok()) {
+          return io_s;
+        }
+      }
+      // Create cipher stream
+      io_s = status_to_io_status(
+          provider->CreateCipherStream(fname, options, prefix, stream));
+    }
+    return io_s;
+  }
+
+ // Creates a CipherStream for the underlying file/name using the options
+ // If a readable provider is found and the file is encrypted, uses
+ // this provider to create a cipher stream.
+ // @param fname Name of the writable file
+ // @param underlying The underlying "raw" file
+ // @param options Options for creating the file/cipher
+ // @param prefix_length Returns the length of the encryption prefix used for
+ // this file
+ // @param stream Returns the cipher stream to use for this file if it
+ // is encrypted
+ // @return OK on success, non-OK on failure.
+  template <class TypeFile>
+  IOStatus CreateSequentialCipherStream(
+      const std::string& fname, const std::unique_ptr<TypeFile>& underlying,
+      const FileOptions& options, size_t* prefix_length,
+      std::unique_ptr<BlockAccessCipherStream>* stream, IODebugContext* dbg) {
+    // Read prefix (if needed)
+    AlignedBuffer buffer;
+    Slice prefix;
+    *prefix_length = provider_->GetPrefixLength();  // NOTE(review): no null check on provider_, unlike writable paths — confirm callers guarantee it
+    if (*prefix_length > 0) {
+      // Read prefix from the start of the (sequential) file
+      buffer.Alignment(underlying->GetRequiredBufferAlignment());
+      buffer.AllocateNewBuffer(*prefix_length);
+      IOStatus status = underlying->Read(*prefix_length, options.io_options,
+                                         &prefix, buffer.BufferStart(), dbg);
+      if (!status.ok()) {
+        return status;
+      }
+      buffer.Size(*prefix_length);
+    }
+    return status_to_io_status(
+        provider_->CreateCipherStream(fname, options, prefix, stream));
+  }
+
+ // Creates a CipherStream for the underlying file/name using the options
+ // If a readable provider is found and the file is encrypted, uses
+ // this provider to create a cipher stream.
+ // @param fname Name of the writable file
+ // @param underlying The underlying "raw" file
+ // @param options Options for creating the file/cipher
+ // @param prefix_length Returns the length of the encryption prefix used for
+ // this file
+ // @param stream Returns the cipher stream to use for this file if it
+ // is encrypted
+ // @return OK on success, non-OK on failure.
+  template <class TypeFile>
+  IOStatus CreateRandomReadCipherStream(
+      const std::string& fname, const std::unique_ptr<TypeFile>& underlying,
+      const FileOptions& options, size_t* prefix_length,
+      std::unique_ptr<BlockAccessCipherStream>* stream, IODebugContext* dbg) {
+    // Read prefix (if needed)
+    AlignedBuffer buffer;
+    Slice prefix;
+    *prefix_length = provider_->GetPrefixLength();  // NOTE(review): assumes provider_ != nullptr — confirm
+    if (*prefix_length > 0) {
+      // Read prefix from offset 0 of the random-access file
+      buffer.Alignment(underlying->GetRequiredBufferAlignment());
+      buffer.AllocateNewBuffer(*prefix_length);
+      IOStatus status = underlying->Read(0, *prefix_length, options.io_options,
+                                         &prefix, buffer.BufferStart(), dbg);
+      if (!status.ok()) {
+        return status;
+      }
+      buffer.Size(*prefix_length);
+    }
+    return status_to_io_status(
+        provider_->CreateCipherStream(fname, options, prefix, stream));
+  }
+
+ public:
+  EncryptedFileSystemImpl(const std::shared_ptr<FileSystem>& base,
+                          const std::shared_ptr<EncryptionProvider>& provider)
+      : EncryptedFileSystem(base) {
+    provider_ = provider;
+    // Expose provider_ as the configurable "provider" option
+    RegisterOptions("EncryptionProvider", &provider_, &encrypted_fs_type_info);
+  }
+
+  Status AddCipher(const std::string& descriptor, const char* cipher,
+                   size_t len, bool for_write) override {
+    return provider_->AddCipher(descriptor, cipher, len, for_write);
+  }
+
+ // NewSequentialFile opens a file for sequential reading.
+  IOStatus NewSequentialFile(const std::string& fname,
+                             const FileOptions& options,
+                             std::unique_ptr<FSSequentialFile>* result,
+                             IODebugContext* dbg) override {
+    result->reset();
+    if (options.use_mmap_reads) {
+      return IOStatus::InvalidArgument();  // mmap can't go through the cipher stream
+    }
+    // Open file using underlying Env implementation
+    std::unique_ptr<FSSequentialFile> underlying;
+    auto status =
+        FileSystemWrapper::NewSequentialFile(fname, options, &underlying, dbg);
+    if (!status.ok()) {
+      return status;
+    }
+    uint64_t file_size;
+    status = FileSystemWrapper::GetFileSize(fname, options.io_options,
+                                            &file_size, dbg);
+    if (!status.ok()) {
+      return status;
+    }
+    if (!file_size) {
+      *result = std::move(underlying);  // empty file: no prefix, serve raw
+      return status;
+    }
+    // Create cipher stream (reads the prefix from the file head)
+    std::unique_ptr<BlockAccessCipherStream> stream;
+    size_t prefix_length;
+    status = CreateSequentialCipherStream(fname, underlying, options,
+                                          &prefix_length, &stream, dbg);
+    if (status.ok()) {
+      result->reset(new EncryptedSequentialFile(
+          std::move(underlying), std::move(stream), prefix_length));
+    }
+    return status;
+  }
+
+ // NewRandomAccessFile opens a file for random read access.
+  IOStatus NewRandomAccessFile(const std::string& fname,
+                               const FileOptions& options,
+                               std::unique_ptr<FSRandomAccessFile>* result,
+                               IODebugContext* dbg) override {
+    result->reset();
+    if (options.use_mmap_reads) {
+      return IOStatus::InvalidArgument();  // mmap can't go through the cipher stream
+    }
+    // Open file using underlying Env implementation
+    std::unique_ptr<FSRandomAccessFile> underlying;
+    auto status = FileSystemWrapper::NewRandomAccessFile(fname, options,
+                                                         &underlying, dbg);
+    if (!status.ok()) {
+      return status;
+    }
+    std::unique_ptr<BlockAccessCipherStream> stream;
+    size_t prefix_length;
+    status = CreateRandomReadCipherStream(fname, underlying, options,
+                                          &prefix_length, &stream, dbg);
+    if (status.ok()) {
+      if (stream) {
+        result->reset(new EncryptedRandomAccessFile(
+            std::move(underlying), std::move(stream), prefix_length));
+      } else {
+        result->reset(underlying.release());  // no stream: serve raw
+      }
+    }
+    return status;
+  }
+
+ // NewWritableFile opens a file for sequential writing.
+  IOStatus NewWritableFile(const std::string& fname, const FileOptions& options,
+                           std::unique_ptr<FSWritableFile>* result,
+                           IODebugContext* dbg) override {
+    result->reset();
+    if (options.use_mmap_writes) {
+      return IOStatus::InvalidArgument();  // mmap can't go through the cipher stream
+    }
+    // Open file using underlying Env implementation
+    std::unique_ptr<FSWritableFile> underlying;
+    IOStatus status =
+        FileSystemWrapper::NewWritableFile(fname, options, &underlying, dbg);
+    if (!status.ok()) {
+      return status;
+    }
+    return CreateWritableEncryptedFile(fname, underlying, options, result, dbg);
+  }
+
+ // Create an object that writes to a new file with the specified
+ // name. Deletes any existing file with the same name and creates a
+ // new file. On success, stores a pointer to the new file in
+ // *result and returns OK. On failure stores nullptr in *result and
+ // returns non-OK.
+ //
+ // The returned file will only be accessed by one thread at a time.
+  IOStatus ReopenWritableFile(const std::string& fname,
+                              const FileOptions& options,
+                              std::unique_ptr<FSWritableFile>* result,
+                              IODebugContext* dbg) override {
+    result->reset();
+    if (options.use_mmap_writes) {
+      return IOStatus::InvalidArgument();  // mmap can't go through the cipher stream
+    }
+    // Open file using underlying Env implementation
+    std::unique_ptr<FSWritableFile> underlying;
+    IOStatus status =
+        FileSystemWrapper::ReopenWritableFile(fname, options, &underlying, dbg);
+    if (!status.ok()) {
+      return status;
+    }
+    return CreateWritableEncryptedFile(fname, underlying, options, result, dbg);
+  }
+
+ // Reuse an existing file by renaming it and opening it as writable.
+  IOStatus ReuseWritableFile(const std::string& fname,
+                             const std::string& old_fname,
+                             const FileOptions& options,
+                             std::unique_ptr<FSWritableFile>* result,
+                             IODebugContext* dbg) override {
+    result->reset();
+    if (options.use_mmap_writes) {
+      return IOStatus::InvalidArgument();  // mmap can't go through the cipher stream
+    }
+    // Open file using underlying Env implementation
+    std::unique_ptr<FSWritableFile> underlying;
+    auto status = FileSystemWrapper::ReuseWritableFile(
+        fname, old_fname, options, &underlying, dbg);
+    if (!status.ok()) {
+      return status;
+    }
+    return CreateWritableEncryptedFile(fname, underlying, options, result, dbg);
+  }
+
+ // Open `fname` for random read and write, if file doesn't exist the file
+ // will be created. On success, stores a pointer to the new file in
+ // *result and returns OK. On failure returns non-OK.
+ //
+ // The returned file will only be accessed by one thread at a time.
+  IOStatus NewRandomRWFile(const std::string& fname, const FileOptions& options,
+                           std::unique_ptr<FSRandomRWFile>* result,
+                           IODebugContext* dbg) override {
+    result->reset();
+    if (options.use_mmap_reads || options.use_mmap_writes) {
+      return IOStatus::InvalidArgument();  // mmap can't go through the cipher stream
+    }
+    // Check file exists. NOTE(review): check-then-open is racy (TOCTOU) — confirm acceptable.
+    bool isNewFile = !FileExists(fname, options.io_options, dbg).ok();
+
+    // Open file using underlying Env implementation
+    std::unique_ptr<FSRandomRWFile> underlying;
+    auto status =
+        FileSystemWrapper::NewRandomRWFile(fname, options, &underlying, dbg);
+    if (!status.ok()) {
+      return status;
+    }
+    // Create cipher stream
+    std::unique_ptr<BlockAccessCipherStream> stream;
+    size_t prefix_length = 0;
+    if (!isNewFile) {
+      // File already exists, read prefix
+      status = CreateRandomReadCipherStream(fname, underlying, options,
+                                            &prefix_length, &stream, dbg);
+    } else {
+      status = CreateRandomWriteCipherStream(fname, underlying, options,
+                                             &prefix_length, &stream, dbg);
+    }
+    if (status.ok()) {
+      if (stream) {
+        result->reset(new EncryptedRandomRWFile(
+            std::move(underlying), std::move(stream), prefix_length));
+      } else {
+        result->reset(underlying.release());  // no stream: serve raw
+      }
+    }
+    return status;
+  }
+
+  // Store in *result the attributes of the children of the specified
+  // directory.
+  // In case the implementation lists the directory prior to iterating the
+  // files, and files are concurrently deleted, the deleted files will be
+  // omitted from result.
+  // The name attributes are relative to "dir".
+  // Original contents of *result are dropped.
+  // Returns OK if "dir" exists and "*result" contains its children.
+  //         NotFound if "dir" does not exist, the calling process does not
+  //                  have permission to access "dir", or if "dir" is invalid.
+  //         IOError if an IO Error was encountered
+  IOStatus GetChildrenFileAttributes(const std::string& dir,
+                                     const IOOptions& options,
+                                     std::vector<FileAttributes>* result,
+                                     IODebugContext* dbg) override {
+    IOStatus io_s =
+        FileSystemWrapper::GetChildrenFileAttributes(dir, options, result, dbg);
+    if (!io_s.ok()) {
+      return io_s;
+    }
+    // Report logical (plain-text) sizes by subtracting the encryption prefix
+    // from every entry that has a readable provider.
+    // No size >= prefix assertion is made here: FileAttributes does not
+    // distinguish directories, and directories would violate it
+    // (breaks env_basic_test otherwise).
+    for (auto& attrs : *result) {
+      EncryptionProvider* provider;
+      io_s = GetReadableProvider(attrs.name, &provider);
+      if (!io_s.ok()) {
+        return io_s;
+      }
+      if (provider != nullptr) {
+        attrs.size_bytes -= provider->GetPrefixLength();
+      }
+    }
+    return IOStatus::OK();
+  }
+
+ // Store the size of fname in *file_size.
+  IOStatus GetFileSize(const std::string& fname, const IOOptions& options,
+                       uint64_t* file_size, IODebugContext* dbg) override {
+    auto status =
+        FileSystemWrapper::GetFileSize(fname, options, file_size, dbg);
+    if (!status.ok() || !(*file_size)) {
+      return status;
+    }
+    // Subtract the encryption prefix so callers observe the logical
+    // (plain-text) size.  `provider` is initialized and the status is
+    // checked *before* the pointer is examined: the original tested a
+    // potentially uninitialized pointer when GetReadableProvider() failed.
+    // (Order now matches GetChildrenFileAttributes above.)
+    EncryptionProvider* provider = nullptr;
+    status = GetReadableProvider(fname, &provider);
+    if (status.ok() && provider != nullptr) {
+      size_t prefixLength = provider->GetPrefixLength();
+      assert(*file_size >= prefixLength);
+      *file_size -= prefixLength;
+    }
+    return status;
+  }
+
+ private:
+ std::shared_ptr<EncryptionProvider> provider_;
+};
+} // namespace
+
+// Builds an EncryptedFileSystemImpl over `base` using `provider` and hands
+// ownership to the caller.  Construction cannot fail, hence always OK.
+Status NewEncryptedFileSystemImpl(
+    const std::shared_ptr<FileSystem>& base,
+    const std::shared_ptr<EncryptionProvider>& provider,
+    std::unique_ptr<FileSystem>* result) {
+  *result = std::unique_ptr<FileSystem>(
+      new EncryptedFileSystemImpl(base, provider));
+  return Status::OK();
+}
+
+// Creates an encrypted FileSystem over `base`, prepares its options, and
+// promotes it to shared ownership.  Returns nullptr on any failure.
+std::shared_ptr<FileSystem> NewEncryptedFS(
+    const std::shared_ptr<FileSystem>& base,
+    const std::shared_ptr<EncryptionProvider>& provider) {
+  std::unique_ptr<FileSystem> encrypted_fs;
+  Status s = NewEncryptedFileSystemImpl(base, provider, &encrypted_fs);
+  if (s.ok()) {
+    s = encrypted_fs->PrepareOptions(ConfigOptions());
+  }
+  if (!s.ok()) {
+    return nullptr;
+  }
+  return std::shared_ptr<FileSystem>(encrypted_fs.release());
+}
+// Returns an Env that encrypts data when stored on disk and decrypts data when
+// read from disk.
+Env* NewEncryptedEnv(Env* base_env,
+                     const std::shared_ptr<EncryptionProvider>& provider) {
+  // Wrap base_env's file system in the encrypting one and compose the two.
+  std::shared_ptr<FileSystem> encrypted_fs =
+      NewEncryptedFS(base_env->GetFileSystem(), provider);
+  return new CompositeEnvWrapper(base_env, encrypted_fs);
+}
+
+// Encrypt one or more (partial) blocks of data at the file offset.
+// Length of data is given in dataSize.
+Status BlockAccessCipherStream::Encrypt(uint64_t fileOffset, char* data,
+                                        size_t dataSize) {
+  // Calculate block index
+  auto blockSize = BlockSize();
+  uint64_t blockIndex = fileOffset / blockSize;
+  size_t blockOffset = fileOffset % blockSize;
+  // Lazily allocated staging buffer; only needed for partial blocks.
+  std::unique_ptr<char[]> blockBuffer;
+
+  std::string scratch;
+  AllocateScratch(scratch);
+
+  // Encrypt individual blocks.
+  while (1) {
+    char* block = data;
+    // Bytes of the current cipher block that fall inside [data, data+dataSize).
+    size_t n = std::min(dataSize, blockSize - blockOffset);
+    if (n != blockSize) {
+      // We're not encrypting a full block.
+      // Copy data to blockBuffer
+      if (!blockBuffer.get()) {
+        // Allocate buffer
+        blockBuffer = std::unique_ptr<char[]>(new char[blockSize]);
+      }
+      block = blockBuffer.get();
+      // Copy plain data to block buffer.  Bytes outside
+      // [blockOffset, blockOffset + n) are left unspecified; only the copied
+      // region is written back below.
+      memmove(block + blockOffset, data, n);
+    }
+    auto status = EncryptBlock(blockIndex, block, (char*)scratch.data());
+    if (!status.ok()) {
+      return status;
+    }
+    if (block != data) {
+      // Copy encrypted data back to `data`.
+      memmove(data, block + blockOffset, n);
+    }
+    // n <= dataSize by construction of std::min above, so no underflow here.
+    dataSize -= n;
+    if (dataSize == 0) {
+      return Status::OK();
+    }
+    // Advance to the start of the next cipher block.
+    data += n;
+    blockOffset = 0;
+    blockIndex++;
+  }
+}
+
+// Decrypt one or more (partial) blocks of data at the file offset.
+// Length of data is given in dataSize.
+Status BlockAccessCipherStream::Decrypt(uint64_t fileOffset, char* data,
+                                        size_t dataSize) {
+  // Calculate block index
+  auto blockSize = BlockSize();
+  uint64_t blockIndex = fileOffset / blockSize;
+  size_t blockOffset = fileOffset % blockSize;
+  // Lazily allocated staging buffer; only needed for partial blocks.
+  std::unique_ptr<char[]> blockBuffer;
+
+  std::string scratch;
+  AllocateScratch(scratch);
+
+  // Decrypt individual blocks.
+  while (1) {
+    char* block = data;
+    // Bytes of the current cipher block that fall inside [data, data+dataSize).
+    size_t n = std::min(dataSize, blockSize - blockOffset);
+    if (n != blockSize) {
+      // We're not decrypting a full block.
+      // Copy data to blockBuffer
+      if (!blockBuffer.get()) {
+        // Allocate buffer
+        blockBuffer = std::unique_ptr<char[]>(new char[blockSize]);
+      }
+      block = blockBuffer.get();
+      // Copy encrypted data to block buffer
+      memmove(block + blockOffset, data, n);
+    }
+    auto status = DecryptBlock(blockIndex, block, (char*)scratch.data());
+    if (!status.ok()) {
+      return status;
+    }
+    if (block != data) {
+      // Copy decrypted data back to `data`.
+      memmove(data, block + blockOffset, n);
+    }
+
+    // Simply decrementing dataSize by n could cause it to underflow,
+    // which will very likely make it read over the original bounds later
+    // (defensive check: std::min above already guarantees n <= dataSize).
+    assert(dataSize >= n);
+    if (dataSize < n) {
+      return Status::Corruption("Cannot decrypt data at given offset");
+    }
+
+    dataSize -= n;
+    if (dataSize == 0) {
+      return Status::OK();
+    }
+    // Advance to the start of the next cipher block.
+    data += n;
+    blockOffset = 0;
+    blockIndex++;
+  }
+}
+
+namespace {
+// Option mapping for ROT13BlockCipher: the registered struct is just the
+// blockSize_ field, hence "block_size" lives at offset 0.
+// NOTE(review): the field is a size_t but is registered as OptionType::kInt
+// -- confirm that is the intended option type.
+static std::unordered_map<std::string, OptionTypeInfo>
+    rot13_block_cipher_type_info = {
+        {"block_size",
+         {0 /* No offset, whole struct*/, OptionType::kInt,
+          OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+};
+// Implements a BlockCipher using ROT13.
+//
+// Note: This is a sample implementation of BlockCipher,
+// it is NOT considered safe and should NOT be used in production.
+class ROT13BlockCipher : public BlockCipher {
+ private:
+  size_t blockSize_;
+
+ public:
+  explicit ROT13BlockCipher(size_t blockSize) : blockSize_(blockSize) {
+    RegisterOptions("ROT13BlockCipherOptions", &blockSize_,
+                    &rot13_block_cipher_type_info);
+  }
+
+  static const char* kClassName() { return "ROT13"; }
+  const char* Name() const override { return kClassName(); }
+
+  // Size (in bytes) of the blocks handled by Encrypt()/Decrypt().
+  size_t BlockSize() override { return blockSize_; }
+
+  // "Encrypt" a block in place by adding 13 to every byte.
+  // Length of data is equal to BlockSize().
+  Status Encrypt(char* data) override {
+    for (char* p = data; p != data + blockSize_; ++p) {
+      *p = static_cast<char>(*p + 13);
+    }
+    return Status::OK();
+  }
+
+  // Decryption deliberately applies the same transformation: in the CTR
+  // stream the cipher output is only XOR-ed with the payload, so symmetry
+  // is all that is required.
+  Status Decrypt(char* data) override { return Encrypt(data); }
+};
+// Option mapping for CTREncryptionProvider: "cipher" resolves (by name) to
+// the shared BlockCipher pointer registered at offset 0 in the provider's
+// constructor.
+static const std::unordered_map<std::string, OptionTypeInfo>
+    ctr_encryption_provider_type_info = {
+        {"cipher",
+         OptionTypeInfo::AsCustomSharedPtr<BlockCipher>(
+             0 /* No offset, whole struct*/, OptionVerificationType::kByName,
+             OptionTypeFlags::kNone)},
+};
+} // anonymous namespace
+
+// Allocate scratch space which is passed to EncryptBlock/DecryptBlock.
+void CTRCipherStream::AllocateScratch(std::string& scratch) {
+  auto blockSize = cipher_->BlockSize();
+  // EncryptBlock() writes blockSize bytes through scratch.data(), so the
+  // string's *size* (not merely its capacity) must cover a full block.
+  // reserve() alone leaves size() == 0, making those writes undefined
+  // behavior; resize() makes the storage legitimately writable.
+  scratch.resize(blockSize);
+}
+
+// Encrypt a block of data at the given block index.
+// Length of data is equal to BlockSize();
+Status CTRCipherStream::EncryptBlock(uint64_t blockIndex, char* data,
+                                     char* scratch) {
+  // Create nonce + counter: copy the IV into scratch, then overwrite its
+  // first 8 bytes with the per-block counter.  `scratch` must provide at
+  // least blockSize writable bytes (see AllocateScratch).
+  auto blockSize = cipher_->BlockSize();
+  memmove(scratch, iv_.data(), blockSize);
+  EncodeFixed64(scratch, blockIndex + initialCounter_);
+
+  // Encrypt nonce+counter
+  auto status = cipher_->Encrypt(scratch);
+  if (!status.ok()) {
+    return status;
+  }
+
+  // XOR data with the encrypted counter block (the keystream).  Applying
+  // the same XOR again decrypts, which is why DecryptBlock reuses this.
+  for (size_t i = 0; i < blockSize; i++) {
+    data[i] = data[i] ^ scratch[i];
+  }
+  return Status::OK();
+}
+
+// Decrypt a block of data at the given block index.
+// Length of data is equal to BlockSize();
+Status CTRCipherStream::DecryptBlock(uint64_t blockIndex, char* data,
+                                     char* scratch) {
+  // For CTR decryption & encryption are the same: the keystream block is
+  // XOR-ed with the payload in both directions.
+  return EncryptBlock(blockIndex, data, scratch);
+}
+
+CTREncryptionProvider::CTREncryptionProvider(
+    const std::shared_ptr<BlockCipher>& c)
+    : cipher_(c) {
+  // Expose the wrapped cipher through the options machinery so it can be
+  // configured/serialized by name (see ctr_encryption_provider_type_info).
+  RegisterOptions("Cipher", &cipher_, &ctr_encryption_provider_type_info);
+}
+
+bool CTREncryptionProvider::IsInstanceOf(const std::string& name) const {
+  // Special case for test purposes: the "1://test" URI matches only when
+  // the configured cipher is the ROT13 test cipher.
+  if (cipher_ != nullptr && name == "1://test") {
+    return cipher_->IsInstanceOf(ROT13BlockCipher::kClassName());
+  }
+  return EncryptionProvider::IsInstanceOf(name);
+}
+
+// GetPrefixLength returns the length of the prefix that is added to every file
+// and used for storing encryption options.
+// For optimal performance, the prefix length should be a multiple of
+// the page size.
+size_t CTREncryptionProvider::GetPrefixLength() const {
+  // 4096 bytes (defaultPrefixLength); see the header for the direct-IO
+  // alignment rationale.
+  return defaultPrefixLength;
+}
+
+Status CTREncryptionProvider::AddCipher(const std::string& /*descriptor*/,
+                                        const char* cipher, size_t len,
+                                        bool /*for_write*/) {
+  // Only one cipher may ever be configured on this provider.
+  if (cipher_) {
+    return Status::NotSupported("Cannot add keys to CTREncryptionProvider");
+  } else if (strcmp(ROT13BlockCipher::kClassName(), cipher) == 0) {
+    // Test-only ROT13 cipher; `len` is (re)used as the block size.
+    // NOTE(review): strcmp assumes `cipher` is NUL-terminated despite the
+    // explicit `len` parameter -- confirm all callers pass C strings.
+    cipher_.reset(new ROT13BlockCipher(len));
+    return Status::OK();
+  } else {
+    // Otherwise treat `cipher` as a BlockCipher identifier/URI and let the
+    // registry construct it.
+    return BlockCipher::CreateFromString(ConfigOptions(), std::string(cipher),
+                                         &cipher_);
+  }
+}
+
+// decodeCTRParameters decodes the initial counter & IV from the given
+// (plain text) prefix.
+static void decodeCTRParameters(const char* prefix, size_t blockSize,
+                                uint64_t& initialCounter, Slice& iv) {
+  // First block contains 64-bit initial counter (fixed-width encoding read
+  // via DecodeFixed64).
+  initialCounter = DecodeFixed64(prefix);
+  // Second block contains IV.  The Slice only references `prefix` -- it does
+  // not copy -- so `prefix` must outlive `iv`.
+  iv = Slice(prefix + blockSize, blockSize);
+}
+
+// CreateNewPrefix initialized an allocated block of prefix memory
+// for a new file.
+Status CTREncryptionProvider::CreateNewPrefix(const std::string& /*fname*/,
+                                              char* prefix,
+                                              size_t prefixLength) const {
+  if (!cipher_) {
+    return Status::InvalidArgument("Encryption Cipher is missing");
+  }
+  // Seed a PRNG from the wall clock and fill the whole prefix with random
+  // bytes; the first two blocks double as the initial counter and the IV.
+  Random rnd((uint32_t)SystemClock::Default()->NowMicros());
+  for (size_t i = 0; i < prefixLength; i++) {
+    prefix[i] = rnd.Uniform(256) & 0xFF;
+  }
+  auto blockSize = cipher_->BlockSize();
+  uint64_t initialCounter;
+  Slice prefixIV;
+  decodeCTRParameters(prefix, blockSize, initialCounter, prefixIV);
+
+  // Let subclasses drop their own (plain-text) payload in from block 2 on.
+  PopulateSecretPrefixPart(prefix + (2 * blockSize),
+                           prefixLength - (2 * blockSize), blockSize);
+
+  // Encrypt everything past the two plain-text parameter blocks (block 0, 1
+  // with initial counter & IV stay unencrypted).
+  CTRCipherStream cipherStream(cipher_, prefixIV.data(), initialCounter);
+  Status status;
+  {
+    PERF_TIMER_GUARD(encrypt_data_nanos);
+    status = cipherStream.Encrypt(0, prefix + (2 * blockSize),
+                                  prefixLength - (2 * blockSize));
+  }
+  return status;
+}
+
+// PopulateSecretPrefixPart initializes the data into a new prefix block
+// in plain text.
+// Returns the amount of space (starting from the start of the prefix)
+// that has been initialized.
+size_t CTREncryptionProvider::PopulateSecretPrefixPart(
+    char* /*prefix*/, size_t /*prefixLength*/, size_t /*blockSize*/) const {
+  // Base implementation stores nothing; subclasses override this to embed
+  // custom (to-be-encrypted) data and report how many bytes they wrote.
+  // Nothing to do here, put in custom data in override when needed.
+  return 0;
+}
+
+Status CTREncryptionProvider::CreateCipherStream(
+    const std::string& fname, const EnvOptions& options, Slice& prefix,
+    std::unique_ptr<BlockAccessCipherStream>* result) {
+  if (!cipher_) {
+    return Status::InvalidArgument("Encryption Cipher is missing");
+  }
+  auto blockSize = cipher_->BlockSize();
+
+  // Validate the prefix size *before* decoding: decodeCTRParameters() reads
+  // the first block (counter) and slices the second (IV), so running it on a
+  // short prefix would read out of bounds.  The original performed this
+  // check only after decoding.
+  assert(prefix.size() >= 2 * blockSize);
+  if (prefix.size() < 2 * blockSize) {
+    return Status::Corruption("Unable to read from file " + fname +
+                              ": read attempt would read beyond file bounds");
+  }
+
+  // Read plain text part of prefix.
+  uint64_t initialCounter;
+  Slice iv;
+  decodeCTRParameters(prefix.data(), blockSize, initialCounter, iv);
+
+  // Decrypt the encrypted part of the prefix, starting from block 2 (block 0,
+  // 1 with initial counter & IV are unencrypted)
+  CTRCipherStream cipherStream(cipher_, iv.data(), initialCounter);
+  Status status;
+  {
+    PERF_TIMER_GUARD(decrypt_data_nanos);
+    status = cipherStream.Decrypt(0, (char*)prefix.data() + (2 * blockSize),
+                                  prefix.size() - (2 * blockSize));
+  }
+  if (!status.ok()) {
+    return status;
+  }
+
+  // Create cipher stream
+  return CreateCipherStreamFromPrefix(fname, options, initialCounter, iv,
+                                      prefix, result);
+}
+
+// CreateCipherStreamFromPrefix creates a block access cipher stream for a
+// file with the given name and options. The given prefix is already decrypted.
+Status CTREncryptionProvider::CreateCipherStreamFromPrefix(
+    const std::string& /*fname*/, const EnvOptions& /*options*/,
+    uint64_t initialCounter, const Slice& iv, const Slice& /*prefix*/,
+    std::unique_ptr<BlockAccessCipherStream>* result) {
+  // Base implementation only needs the counter and IV; subclasses may use
+  // the file name, options, or decrypted prefix payload.
+  (*result) = std::unique_ptr<BlockAccessCipherStream>(
+      new CTRCipherStream(cipher_, iv.data(), initialCounter));
+  return Status::OK();
+}
+
+namespace {
+// One-time, thread-safe registration of the built-in encryption factories
+// (CTR provider and ROT13 test cipher) with the default ObjectRegistry.
+static void RegisterEncryptionBuiltins() {
+  static std::once_flag once;
+  std::call_once(once, [&]() {
+    auto lib = ObjectRegistry::Default()->AddLibrary("encryption");
+    // Match "CTR" or "CTR://test"
+    lib->AddFactory<EncryptionProvider>(
+        ObjectLibrary::PatternEntry(CTREncryptionProvider::kClassName(), true)
+            .AddSuffix("://test"),
+        [](const std::string& uri, std::unique_ptr<EncryptionProvider>* guard,
+           std::string* /*errmsg*/) {
+          if (EndsWith(uri, "://test")) {
+            // Test variant: pre-configure a 32-byte ROT13 cipher.
+            std::shared_ptr<BlockCipher> cipher =
+                std::make_shared<ROT13BlockCipher>(32);
+            guard->reset(new CTREncryptionProvider(cipher));
+          } else {
+            guard->reset(new CTREncryptionProvider());
+          }
+          return guard->get();
+        });
+
+    // Test-only alias "1://test": same as "CTR://test" above (see also the
+    // special case in CTREncryptionProvider::IsInstanceOf).
+    lib->AddFactory<EncryptionProvider>(
+        "1://test", [](const std::string& /*uri*/,
+                       std::unique_ptr<EncryptionProvider>* guard,
+                       std::string* /*errmsg*/) {
+          std::shared_ptr<BlockCipher> cipher =
+              std::make_shared<ROT13BlockCipher>(32);
+          guard->reset(new CTREncryptionProvider(cipher));
+          return guard->get();
+        });
+
+    // Match "ROT13" or "ROT13:[0-9]+"
+    lib->AddFactory<BlockCipher>(
+        ObjectLibrary::PatternEntry(ROT13BlockCipher::kClassName(), true)
+            .AddNumber(":"),
+        [](const std::string& uri, std::unique_ptr<BlockCipher>* guard,
+           std::string* /* errmsg */) {
+          // Optional ":<n>" suffix selects the block size (default 32).
+          size_t colon = uri.find(':');
+          if (colon != std::string::npos) {
+            size_t block_size = ParseSizeT(uri.substr(colon + 1));
+            guard->reset(new ROT13BlockCipher(block_size));
+          } else {
+            guard->reset(new ROT13BlockCipher(32));
+          }
+
+          return guard->get();
+        });
+  });
+}
+} // namespace
+
+Status BlockCipher::CreateFromString(const ConfigOptions& config_options,
+                                     const std::string& value,
+                                     std::shared_ptr<BlockCipher>* result) {
+  // Ensure the built-in factories are registered, then let the object
+  // registry resolve `value` to a cipher instance.
+  RegisterEncryptionBuiltins();
+  return LoadSharedObject<BlockCipher>(config_options, value, nullptr, result);
+}
+
+Status EncryptionProvider::CreateFromString(
+    const ConfigOptions& config_options, const std::string& value,
+    std::shared_ptr<EncryptionProvider>* result) {
+  // Ensure the built-in factories are registered, then let the object
+  // registry resolve `value` to a provider instance.
+  RegisterEncryptionBuiltins();
+  return LoadSharedObject<EncryptionProvider>(config_options, value, nullptr,
+                                              result);
+}
+
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/env/env_encryption_ctr.h b/src/rocksdb/env/env_encryption_ctr.h
new file mode 100644
index 000000000..cfb440c72
--- /dev/null
+++ b/src/rocksdb/env/env_encryption_ctr.h
@@ -0,0 +1,116 @@
+// Copyright (c) 2016-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#if !defined(ROCKSDB_LITE)
+
+#include "rocksdb/env_encryption.h"
+
+namespace ROCKSDB_NAMESPACE {
+// CTRCipherStream implements BlockAccessCipherStream using an
+// Counter operations mode.
+// See https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation
+//
+// Note: This is a possible implementation of BlockAccessCipherStream,
+// it is considered suitable for use.
+class CTRCipherStream final : public BlockAccessCipherStream {
+ private:
+  std::shared_ptr<BlockCipher> cipher_;
+  // Owned copy of the IV; its length always equals the cipher's block size.
+  std::string iv_;
+  uint64_t initialCounter_;
+
+ public:
+  // `iv` must point to at least c->BlockSize() bytes; it is copied.
+  // (Stray semicolons after the ctor/dtor bodies removed; dtor defaulted.)
+  CTRCipherStream(const std::shared_ptr<BlockCipher>& c, const char* iv,
+                  uint64_t initialCounter)
+      : cipher_(c), iv_(iv, c->BlockSize()), initialCounter_(initialCounter) {}
+  virtual ~CTRCipherStream() = default;
+
+  // BlockSize returns the size of each block supported by this cipher stream.
+  size_t BlockSize() override { return cipher_->BlockSize(); }
+
+ protected:
+  // Allocate scratch space which is passed to EncryptBlock/DecryptBlock.
+  void AllocateScratch(std::string&) override;
+
+  // Encrypt a block of data at the given block index.
+  // Length of data is equal to BlockSize();
+  Status EncryptBlock(uint64_t blockIndex, char* data, char* scratch) override;
+
+  // Decrypt a block of data at the given block index.
+  // Length of data is equal to BlockSize();
+  Status DecryptBlock(uint64_t blockIndex, char* data, char* scratch) override;
+};
+
+// This encryption provider uses a CTR cipher stream, with a given block cipher
+// and IV.
+//
+// Note: This is a possible implementation of EncryptionProvider,
+// it is considered suitable for use, provided a safe BlockCipher is used.
+class CTREncryptionProvider : public EncryptionProvider {
+ private:
+  // Block cipher used to generate the CTR keystream; may be null until
+  // configured (constructor, options, or AddCipher).
+  std::shared_ptr<BlockCipher> cipher_;
+
+ protected:
+  // For optimal performance when using direct IO, the prefix length should be a
+  // multiple of the page size. This size is to ensure the first real data byte
+  // is placed at largest known alignment point for direct io.
+  const static size_t defaultPrefixLength = 4096;
+
+ public:
+  explicit CTREncryptionProvider(
+      const std::shared_ptr<BlockCipher>& c = nullptr);
+  virtual ~CTREncryptionProvider() {}
+
+  static const char* kClassName() { return "CTR"; }
+  const char* Name() const override { return kClassName(); }
+  bool IsInstanceOf(const std::string& name) const override;
+
+  // GetPrefixLength returns the length of the prefix that is added to every
+  // file and used for storing encryption options.
+  // For optimal performance when using direct IO, the prefix length should
+  // be a multiple of the page size.
+  size_t GetPrefixLength() const override;
+
+  // CreateNewPrefix initialized an allocated block of prefix memory
+  // for a new file.
+  Status CreateNewPrefix(const std::string& fname, char* prefix,
+                         size_t prefixLength) const override;
+
+  // CreateCipherStream creates a block access cipher stream for a file with
+  // the given name and options.
+  Status CreateCipherStream(
+      const std::string& fname, const EnvOptions& options, Slice& prefix,
+      std::unique_ptr<BlockAccessCipherStream>* result) override;
+
+  // Configures the single cipher; only allowed once (see implementation).
+  Status AddCipher(const std::string& descriptor, const char* /*cipher*/,
+                   size_t /*len*/, bool /*for_write*/) override;
+
+ protected:
+  // PopulateSecretPrefixPart initializes the data into a new prefix block
+  // that will be encrypted. This function will store the data in plain text.
+  // It will be encrypted later (before written to disk).
+  // Returns the amount of space (starting from the start of the prefix)
+  // that has been initialized.
+  virtual size_t PopulateSecretPrefixPart(char* prefix, size_t prefixLength,
+                                          size_t blockSize) const;
+
+  // CreateCipherStreamFromPrefix creates a block access cipher stream for a
+  // file with the given name and options. The given prefix is already
+  // decrypted.
+  virtual Status CreateCipherStreamFromPrefix(
+      const std::string& fname, const EnvOptions& options,
+      uint64_t initialCounter, const Slice& iv, const Slice& prefix,
+      std::unique_ptr<BlockAccessCipherStream>* result);
+};
+
+// Constructs an encrypted FileSystem over `base_fs` using `provider` and
+// returns it through `fs` (always returns OK).
+Status NewEncryptedFileSystemImpl(
+    const std::shared_ptr<FileSystem>& base_fs,
+    const std::shared_ptr<EncryptionProvider>& provider,
+    std::unique_ptr<FileSystem>* fs);
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !defined(ROCKSDB_LITE)
diff --git a/src/rocksdb/env/env_posix.cc b/src/rocksdb/env/env_posix.cc
new file mode 100644
index 000000000..77f28e1f5
--- /dev/null
+++ b/src/rocksdb/env/env_posix.cc
@@ -0,0 +1,520 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors
+
+#include "port/lang.h"
+#if !defined(OS_WIN)
+
+#include <dirent.h>
+#ifndef ROCKSDB_NO_DYNAMIC_EXTENSION
+#include <dlfcn.h>
+#endif
+#include <errno.h>
+#include <fcntl.h>
+
+#if defined(ROCKSDB_IOURING_PRESENT)
+#include <liburing.h>
+#endif
+#include <pthread.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#if defined(OS_LINUX) || defined(OS_SOLARIS) || defined(OS_ANDROID)
+#include <sys/statfs.h>
+#endif
+#include <sys/statvfs.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#if defined(ROCKSDB_IOURING_PRESENT)
+#include <sys/uio.h>
+#endif
+#include <time.h>
+#include <unistd.h>
+
+#include <algorithm>
+// Get nano time includes
+#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD)
+#elif defined(__MACH__)
+#include <Availability.h>
+#include <mach/clock.h>
+#include <mach/mach.h>
+#else
+#include <chrono>
+#endif
+#include <deque>
+#include <set>
+#include <vector>
+
+#include "env/composite_env_wrapper.h"
+#include "env/io_posix.h"
+#include "monitoring/iostats_context_imp.h"
+#include "monitoring/thread_status_updater.h"
+#include "port/port.h"
+#include "port/sys_time.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/system_clock.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+#include "util/compression_context_cache.h"
+#include "util/random.h"
+#include "util/string_util.h"
+#include "util/thread_local.h"
+#include "util/threadpool_imp.h"
+
+#if !defined(TMPFS_MAGIC)
+#define TMPFS_MAGIC 0x01021994
+#endif
+#if !defined(XFS_SUPER_MAGIC)
+#define XFS_SUPER_MAGIC 0x58465342
+#endif
+#if !defined(EXT4_SUPER_MAGIC)
+#define EXT4_SUPER_MAGIC 0xEF53
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+#if defined(OS_WIN)
+static const std::string kSharedLibExt = ".dll";
+static const char kPathSeparator = ';';
+#else
+static const char kPathSeparator = ':';
+#if defined(OS_MACOSX)
+static const std::string kSharedLibExt = ".dylib";
+#else
+static const std::string kSharedLibExt = ".so";
+#endif
+#endif
+
+namespace {
+
+// Creates the ThreadStatusUpdater shared by PosixEnv instances; it is
+// deliberately never deleted (see the note in ~PosixEnv below).
+ThreadStatusUpdater* CreateThreadStatusUpdater() {
+  return new ThreadStatusUpdater();
+}
+
+#ifndef ROCKSDB_NO_DYNAMIC_EXTENSION
+// RAII wrapper around a dlopen() handle; dlclose()s it on destruction.
+class PosixDynamicLibrary : public DynamicLibrary {
+ public:
+  PosixDynamicLibrary(const std::string& name, void* handle)
+      : name_(name), handle_(handle) {}
+  // Owns `handle_`: a copy would dlclose() the same handle twice, so copies
+  // are disabled (rule of five; moves are implicitly suppressed too).
+  PosixDynamicLibrary(const PosixDynamicLibrary&) = delete;
+  PosixDynamicLibrary& operator=(const PosixDynamicLibrary&) = delete;
+  ~PosixDynamicLibrary() override { dlclose(handle_); }
+
+  // Resolves `sym_name` in the loaded library, storing its address in *func.
+  // Returns NotFound (carrying the dlerror() text) when the symbol is absent.
+  Status LoadSymbol(const std::string& sym_name, void** func) override {
+    assert(nullptr != func);
+    dlerror();  // Clear any old error
+    *func = dlsym(handle_, sym_name.c_str());
+    if (*func != nullptr) {
+      return Status::OK();
+    } else {
+      char* err = dlerror();
+      return Status::NotFound("Error finding symbol: " + sym_name, err);
+    }
+  }
+
+  const char* Name() const override { return name_.c_str(); }
+
+ private:
+  std::string name_;
+  void* handle_;
+};
+#endif // !ROCKSDB_NO_DYNAMIC_EXTENSION
+
+// SystemClock implementation backed by POSIX/platform time primitives.
+class PosixClock : public SystemClock {
+ public:
+  static const char* kClassName() { return "PosixClock"; }
+  const char* Name() const override { return kDefaultName(); }
+  const char* NickName() const override { return kClassName(); }
+
+  // Wall-clock time in microseconds since the epoch (gettimeofday).
+  uint64_t NowMicros() override {
+    port::TimeVal tv;
+    port::GetTimeOfDay(&tv, nullptr);
+    return static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
+  }
+
+  // Monotonic (or best-available) time in nanoseconds; source varies per
+  // platform.
+  uint64_t NowNanos() override {
+#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD) || \
+    defined(OS_AIX)
+    struct timespec ts;
+    clock_gettime(CLOCK_MONOTONIC, &ts);
+    return static_cast<uint64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec;
+#elif defined(OS_SOLARIS)
+    return gethrtime();
+#elif defined(__MACH__)
+    clock_serv_t cclock;
+    mach_timespec_t ts;
+    host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock);
+    clock_get_time(cclock, &ts);
+    mach_port_deallocate(mach_task_self(), cclock);
+    return static_cast<uint64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec;
+#else
+    return std::chrono::duration_cast<std::chrono::nanoseconds>(
+               std::chrono::steady_clock::now().time_since_epoch())
+        .count();
+#endif
+  }
+
+  // Per-thread CPU time in microseconds; returns 0 where unsupported.
+  uint64_t CPUMicros() override {
+#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD) || \
+    defined(OS_AIX) || (defined(__MACH__) && defined(__MAC_10_12))
+    struct timespec ts;
+    clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
+    return (static_cast<uint64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec) / 1000;
+#endif
+    return 0;
+  }
+
+  // Per-thread CPU time in nanoseconds; returns 0 where unsupported.
+  uint64_t CPUNanos() override {
+#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD) || \
+    defined(OS_AIX) || (defined(__MACH__) && defined(__MAC_10_12))
+    struct timespec ts;
+    clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
+    return static_cast<uint64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec;
+#endif
+    return 0;
+  }
+
+  void SleepForMicroseconds(int micros) override { usleep(micros); }
+
+  // Seconds since the epoch via time(); IOError on failure.
+  Status GetCurrentTime(int64_t* unix_time) override {
+    time_t ret = time(nullptr);
+    if (ret == (time_t)-1) {
+      return IOError("GetCurrentTime", "", errno);
+    }
+    *unix_time = (int64_t)ret;
+    return Status::OK();
+  }
+
+  // Formats seconds-since-1970 as "YYYY/MM/DD-HH:MM:SS " in local time.
+  // NOTE(review): the returned string keeps its full 64-byte size (NUL
+  // padding after the formatted text) -- confirm callers only print it.
+  std::string TimeToString(uint64_t secondsSince1970) override {
+    const time_t seconds = (time_t)secondsSince1970;
+    struct tm t;
+    int maxsize = 64;
+    std::string dummy;
+    dummy.reserve(maxsize);
+    dummy.resize(maxsize);
+    char* p = &dummy[0];
+    port::LocalTimeR(&seconds, &t);
+    snprintf(p, maxsize, "%04d/%02d/%02d-%02d:%02d:%02d ", t.tm_year + 1900,
+             t.tm_mon + 1, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec);
+    return dummy;
+  }
+};
+
+class PosixEnv : public CompositeEnv {
+ public:
+ static const char* kClassName() { return "PosixEnv"; }
+ const char* Name() const override { return kClassName(); }
+ const char* NickName() const override { return kDefaultName(); }
+
+ ~PosixEnv() override {
+ if (this == Env::Default()) {
+ for (const auto tid : threads_to_join_) {
+ pthread_join(tid, nullptr);
+ }
+ for (int pool_id = 0; pool_id < Env::Priority::TOTAL; ++pool_id) {
+ thread_pools_[pool_id].JoinAllThreads();
+ }
+ // Do not delete the thread_status_updater_ in order to avoid the
+ // free after use when Env::Default() is destructed while some other
+ // child threads are still trying to update thread status. All
+ // PosixEnv instances use the same thread_status_updater_, so never
+ // explicitly delete it.
+ }
+ }
+
+ void SetFD_CLOEXEC(int fd, const EnvOptions* options) {
+ if ((options == nullptr || options->set_fd_cloexec) && fd > 0) {
+ fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
+ }
+ }
+
+#ifndef ROCKSDB_NO_DYNAMIC_EXTENSION
+ // Loads the named library into the result.
+ // If the input name is empty, the current executable is loaded
+ // On *nix systems, a "lib" prefix is added to the name if one is not supplied
+ // Comparably, the appropriate shared library extension is added to the name
+ // if not supplied. If search_path is not specified, the shared library will
+ // be loaded using the default path (LD_LIBRARY_PATH) If search_path is
+ // specified, the shared library will be searched for in the directories
+ // provided by the search path
+ Status LoadLibrary(const std::string& name, const std::string& path,
+ std::shared_ptr<DynamicLibrary>* result) override {
+ assert(result != nullptr);
+ if (name.empty()) {
+ void* hndl = dlopen(NULL, RTLD_NOW);
+ if (hndl != nullptr) {
+ result->reset(new PosixDynamicLibrary(name, hndl));
+ return Status::OK();
+ }
+ } else {
+ std::string library_name = name;
+ if (library_name.find(kSharedLibExt) == std::string::npos) {
+ library_name = library_name + kSharedLibExt;
+ }
+#if !defined(OS_WIN)
+ if (library_name.find('/') == std::string::npos &&
+ library_name.compare(0, 3, "lib") != 0) {
+ library_name = "lib" + library_name;
+ }
+#endif
+ if (path.empty()) {
+ void* hndl = dlopen(library_name.c_str(), RTLD_NOW);
+ if (hndl != nullptr) {
+ result->reset(new PosixDynamicLibrary(library_name, hndl));
+ return Status::OK();
+ }
+ } else {
+ std::string local_path;
+ std::stringstream ss(path);
+ while (getline(ss, local_path, kPathSeparator)) {
+ if (!path.empty()) {
+ std::string full_name = local_path + "/" + library_name;
+ void* hndl = dlopen(full_name.c_str(), RTLD_NOW);
+ if (hndl != nullptr) {
+ result->reset(new PosixDynamicLibrary(full_name, hndl));
+ return Status::OK();
+ }
+ }
+ }
+ }
+ }
+ return Status::IOError(
+ IOErrorMsg("Failed to open shared library: xs", name), dlerror());
+ }
+#endif // !ROCKSDB_NO_DYNAMIC_EXTENSION
+
+ void Schedule(void (*function)(void* arg1), void* arg, Priority pri = LOW,
+ void* tag = nullptr,
+ void (*unschedFunction)(void* arg) = nullptr) override;
+
+ int UnSchedule(void* arg, Priority pri) override;
+
+ void StartThread(void (*function)(void* arg), void* arg) override;
+
+ void WaitForJoin() override;
+
+ unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const override;
+
+ int ReserveThreads(int threads_to_be_reserved, Priority pri) override;
+
+ int ReleaseThreads(int threads_to_be_released, Priority pri) override;
+
+ Status GetThreadList(std::vector<ThreadStatus>* thread_list) override {
+ assert(thread_status_updater_);
+ return thread_status_updater_->GetThreadList(thread_list);
+ }
+
+ uint64_t GetThreadID() const override {
+ uint64_t thread_id = 0;
+#if defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ)
+#if __GLIBC_PREREQ(2, 30)
+ thread_id = ::gettid();
+#else // __GLIBC_PREREQ(2, 30)
+ pthread_t tid = pthread_self();
+ memcpy(&thread_id, &tid, std::min(sizeof(thread_id), sizeof(tid)));
+#endif // __GLIBC_PREREQ(2, 30)
+#else // defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ)
+ pthread_t tid = pthread_self();
+ memcpy(&thread_id, &tid, std::min(sizeof(thread_id), sizeof(tid)));
+#endif // defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ)
+ return thread_id;
+ }
+
+ Status GetHostName(char* name, uint64_t len) override {
+ int ret = gethostname(name, static_cast<size_t>(len));
+ if (ret < 0) {
+ if (errno == EFAULT || errno == EINVAL) {
+ return Status::InvalidArgument(errnoStr(errno).c_str());
+ } else {
+ return IOError("GetHostName", name, errno);
+ }
+ }
+ return Status::OK();
+ }
+
+ ThreadStatusUpdater* GetThreadStatusUpdater() const override {
+ return Env::GetThreadStatusUpdater();
+ }
+
+ std::string GenerateUniqueId() override { return Env::GenerateUniqueId(); }
+
+  // Set the number of worker threads in the given pool (the pool itself
+  // decides how to grow/shrink; see ThreadPoolImpl::SetBackgroundThreads).
+  void SetBackgroundThreads(int num, Priority pri) override {
+    assert(pri >= Priority::BOTTOM && pri <= Priority::HIGH);
+    thread_pools_[pri].SetBackgroundThreads(num);
+  }
+
+  // Return the configured number of worker threads in the given pool.
+  int GetBackgroundThreads(Priority pri) override {
+    assert(pri >= Priority::BOTTOM && pri <= Priority::HIGH);
+    return thread_pools_[pri].GetBackgroundThreads();
+  }
+
+  // Toggle whether newly created db files are readable by non-owners.
+  // Affects files created after this call; always returns OK.
+  Status SetAllowNonOwnerAccess(bool allow_non_owner_access) override {
+    allow_non_owner_access_ = allow_non_owner_access;
+    return Status::OK();
+  }
+
+  // Allow increasing the number of worker threads (never shrinks the pool).
+  void IncBackgroundThreadsIfNeeded(int num, Priority pri) override {
+    assert(pri >= Priority::BOTTOM && pri <= Priority::HIGH);
+    thread_pools_[pri].IncBackgroundThreadsIfNeeded(num);
+  }
+
+  // Lower the IO priority of the given pool's threads. Only effective on
+  // Linux (ioprio); a no-op elsewhere.
+  void LowerThreadPoolIOPriority(Priority pool) override {
+    assert(pool >= Priority::BOTTOM && pool <= Priority::HIGH);
+#ifdef OS_LINUX
+    thread_pools_[pool].LowerIOPriority();
+#else
+    (void)pool;
+#endif
+  }
+
+  // Legacy overload: lower the pool's CPU priority to the fixed kLow level.
+  void LowerThreadPoolCPUPriority(Priority pool) override {
+    assert(pool >= Priority::BOTTOM && pool <= Priority::HIGH);
+    thread_pools_[pool].LowerCPUPriority(CpuPriority::kLow);
+  }
+
+  // Lower the pool's CPU priority to an explicit level; always returns OK.
+  Status LowerThreadPoolCPUPriority(Priority pool, CpuPriority pri) override {
+    assert(pool >= Priority::BOTTOM && pool <= Priority::HIGH);
+    thread_pools_[pool].LowerCPUPriority(pri);
+    return Status::OK();
+  }
+
+ private:
+ friend Env* Env::Default();
+ // Constructs the default Env, a singleton
+ PosixEnv();
+
+ // The below 4 members are only used by the default PosixEnv instance.
+ // Non-default instances simply maintain references to the backing
+  // members in the default instance.
+ std::vector<ThreadPoolImpl> thread_pools_storage_;
+ pthread_mutex_t mu_storage_;
+ std::vector<pthread_t> threads_to_join_storage_;
+ bool allow_non_owner_access_storage_;
+
+ std::vector<ThreadPoolImpl>& thread_pools_;
+ pthread_mutex_t& mu_;
+ std::vector<pthread_t>& threads_to_join_;
+ // If true, allow non owner read access for db files. Otherwise, non-owner
+ // has no access to db files.
+ bool& allow_non_owner_access_;
+};
+
+// Construct the default PosixEnv singleton: the *_storage_ members hold the
+// real state and the reference members are bound to them. Each thread pool
+// is tagged with its priority and given a back-pointer to this Env so worker
+// threads can set up their thread-local state.
+PosixEnv::PosixEnv()
+    : CompositeEnv(FileSystem::Default(), SystemClock::Default()),
+      thread_pools_storage_(Priority::TOTAL),
+      allow_non_owner_access_storage_(true),
+      thread_pools_(thread_pools_storage_),
+      mu_(mu_storage_),
+      threads_to_join_(threads_to_join_storage_),
+      allow_non_owner_access_(allow_non_owner_access_storage_) {
+  ThreadPoolImpl::PthreadCall("mutex_init", pthread_mutex_init(&mu_, nullptr));
+  for (int pool_id = 0; pool_id < Env::Priority::TOTAL; ++pool_id) {
+    thread_pools_[pool_id].SetThreadPriority(
+        static_cast<Env::Priority>(pool_id));
+    // This allows later initializing the thread-local-env of each thread.
+    thread_pools_[pool_id].SetHostEnv(this);
+  }
+  thread_status_updater_ = CreateThreadStatusUpdater();
+}
+
+// Queue `function(arg)` on the pool selected by `pri`. `tag` labels the job
+// for later UnSchedule; `unschedFunction`, if set, runs when the job is
+// cancelled instead of executed.
+void PosixEnv::Schedule(void (*function)(void* arg1), void* arg, Priority pri,
+                        void* tag, void (*unschedFunction)(void* arg)) {
+  assert(pri >= Priority::BOTTOM && pri <= Priority::HIGH);
+  thread_pools_[pri].Schedule(function, arg, tag, unschedFunction);
+}
+
+// Remove still-queued jobs tagged `arg` from the `pri` pool; returns the
+// number of jobs removed (already-running jobs are unaffected).
+int PosixEnv::UnSchedule(void* arg, Priority pri) {
+  return thread_pools_[pri].UnSchedule(arg);
+}
+
+// Number of jobs waiting (not yet running) in the given pool's queue.
+unsigned int PosixEnv::GetThreadPoolQueueLen(Priority pri) const {
+  assert(pri >= Priority::BOTTOM && pri <= Priority::HIGH);
+  return thread_pools_[pri].GetQueueLen();
+}
+
+// Try to reserve up to `threads_to_reserved` idle threads in the pool;
+// returns how many were actually reserved.
+int PosixEnv::ReserveThreads(int threads_to_reserved, Priority pri) {
+  assert(pri >= Priority::BOTTOM && pri <= Priority::HIGH);
+  return thread_pools_[pri].ReserveThreads(threads_to_reserved);
+}
+
+// Release previously reserved threads back to the pool; returns how many
+// were actually released.
+int PosixEnv::ReleaseThreads(int threads_to_released, Priority pri) {
+  assert(pri >= Priority::BOTTOM && pri <= Priority::HIGH);
+  return thread_pools_[pri].ReleaseThreads(threads_to_released);
+}
+
+// Heap-allocated trampoline payload for StartThread: carries the user
+// callback and its argument across the pthread_create boundary.
+struct StartThreadState {
+  void (*user_function)(void*);
+  void* arg;
+};
+
+// pthread entry point: run the user function, then free the state that
+// StartThread allocated for this thread.
+static void* StartThreadWrapper(void* arg) {
+  StartThreadState* state = reinterpret_cast<StartThreadState*>(arg);
+  state->user_function(state->arg);
+  delete state;
+  return nullptr;
+}
+
+// Spawn a detached-style worker running `function(arg)` and remember its
+// handle (under mu_) so WaitForJoin can join it later. The trampoline state
+// is freed by StartThreadWrapper when the thread finishes.
+void PosixEnv::StartThread(void (*function)(void* arg), void* arg) {
+  pthread_t t;
+  StartThreadState* state = new StartThreadState;
+  state->user_function = function;
+  state->arg = arg;
+  ThreadPoolImpl::PthreadCall(
+      "start thread", pthread_create(&t, nullptr, &StartThreadWrapper, state));
+  ThreadPoolImpl::PthreadCall("lock", pthread_mutex_lock(&mu_));
+  threads_to_join_.push_back(t);
+  ThreadPoolImpl::PthreadCall("unlock", pthread_mutex_unlock(&mu_));
+}
+
+// Join every thread started via StartThread, then forget the handles.
+// NOTE(review): threads_to_join_ is read here without holding mu_ --
+// presumably callers do not race WaitForJoin with StartThread; confirm.
+void PosixEnv::WaitForJoin() {
+  for (const auto tid : threads_to_join_) {
+    pthread_join(tid, nullptr);
+  }
+  threads_to_join_.clear();
+}
+
+} // namespace
+
+//
+// Default Posix Env
+//
+// Return the process-wide default Env, constructed lazily on first call.
+Env* Env::Default() {
+  // The following function call initializes the singletons of ThreadLocalPtr
+  // right before the static default_env. This guarantees default_env will
+  // always being destructed before the ThreadLocalPtr singletons get
+  // destructed as C++ guarantees that the destructions of static variables
+  // is in the reverse order of their constructions.
+  //
+  // Since static members are destructed in the reverse order
+  // of their construction, having this call here guarantees that
+  // the destructor of static PosixEnv will go first, then the
+  // the singletons of ThreadLocalPtr.
+  ThreadLocalPtr::InitSingletons();
+  CompressionContextCache::InitSingleton();
+  INIT_SYNC_POINT_SINGLETONS();
+  // ~PosixEnv must be called on exit
+  //**TODO: Can we make this a STATIC_AVOID_DESTRUCTION?
+  static PosixEnv default_env;
+  return &default_env;
+}
+
+//
+// Default Posix SystemClock
+//
+// Process-wide default SystemClock. STATIC_AVOID_DESTRUCTION keeps the
+// shared_ptr alive through shutdown so late users never see a destroyed
+// clock.
+const std::shared_ptr<SystemClock>& SystemClock::Default() {
+  STATIC_AVOID_DESTRUCTION(std::shared_ptr<SystemClock>, instance)
+  (std::make_shared<PosixClock>());
+  return instance;
+}
+} // namespace ROCKSDB_NAMESPACE
+
+#endif
diff --git a/src/rocksdb/env/env_test.cc b/src/rocksdb/env/env_test.cc
new file mode 100644
index 000000000..f4e9d50b2
--- /dev/null
+++ b/src/rocksdb/env/env_test.cc
@@ -0,0 +1,3562 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef OS_WIN
+#include <sys/ioctl.h>
+#endif
+
+#if defined(ROCKSDB_IOURING_PRESENT)
+#include <liburing.h>
+#include <sys/uio.h>
+#endif
+
+#include <sys/types.h>
+
+#include <atomic>
+#include <list>
+#include <mutex>
+#include <unordered_set>
+
+#ifdef OS_LINUX
+#include <fcntl.h>
+#include <linux/fs.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#endif
+
+#ifdef ROCKSDB_FALLOCATE_PRESENT
+#include <errno.h>
+#endif
+
+#include "db/db_impl/db_impl.h"
+#include "env/emulated_clock.h"
+#include "env/env_chroot.h"
+#include "env/env_encryption_ctr.h"
+#include "env/fs_readonly.h"
+#include "env/mock_env.h"
+#include "env/unique_id_gen.h"
+#include "logging/log_buffer.h"
+#include "logging/logging.h"
+#include "options/options_helper.h"
+#include "port/malloc.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/env.h"
+#include "rocksdb/env_encryption.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/system_clock.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "test_util/mock_time_env.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "util/mutexlock.h"
+#include "util/random.h"
+#include "util/string_util.h"
+#include "utilities/counted_fs.h"
+#include "utilities/env_timed.h"
+#include "utilities/fault_injection_env.h"
+#include "utilities/fault_injection_fs.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+using port::kPageSize;
+
+static const int kDelayMicros = 100000;
+
+// unique_ptr deleter that invokes a caller-supplied free function; used by
+// NewAligned so the matching _aligned_free/free is applied to the buffer.
+struct Deleter {
+  explicit Deleter(void (*fn)(void*)) : fn_(fn) {}
+
+  void operator()(void* ptr) {
+    assert(fn_);
+    assert(ptr);
+    (*fn_)(ptr);
+  }
+
+  void (*fn_)(void*);
+};
+
+extern "C" bool RocksDbIOUringEnable() { return true; }
+
+// Allocate a page-aligned buffer of `size` bytes filled with `ch`, wrapped
+// in a unique_ptr whose Deleter matches the allocator used on this platform.
+// Returns a null pointer (with a valid deleter) on allocation failure.
+std::unique_ptr<char, Deleter> NewAligned(const size_t size, const char ch) {
+  char* ptr = nullptr;
+#ifdef OS_WIN
+  if (nullptr ==
+      (ptr = reinterpret_cast<char*>(_aligned_malloc(size, kPageSize)))) {
+    return std::unique_ptr<char, Deleter>(nullptr, Deleter(_aligned_free));
+  }
+  std::unique_ptr<char, Deleter> uptr(ptr, Deleter(_aligned_free));
+#else
+  if (posix_memalign(reinterpret_cast<void**>(&ptr), kPageSize, size) != 0) {
+    return std::unique_ptr<char, Deleter>(nullptr, Deleter(free));
+  }
+  std::unique_ptr<char, Deleter> uptr(ptr, Deleter(free));
+#endif
+  memset(uptr.get(), ch, size);
+  return uptr;
+}
+
+// Base fixture for posix Env tests; runs against Env::Default() without
+// direct IO. The destructor resets sync-point state so one test's hooks
+// cannot leak into the next.
+class EnvPosixTest : public testing::Test {
+ private:
+  port::Mutex mu_;
+  std::string events_;
+
+ public:
+  Env* env_;
+  bool direct_io_;
+  EnvPosixTest() : env_(Env::Default()), direct_io_(false) {}
+  ~EnvPosixTest() {
+    SyncPoint::GetInstance()->DisableProcessing();
+    SyncPoint::GetInstance()->LoadDependency({});
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+  }
+};
+
+// Parameterized variant: the param supplies the Env under test and whether
+// direct IO should be used. The destructor drains both thread pools so a
+// test's leftover background work cannot interfere with later tests.
+class EnvPosixTestWithParam
+    : public EnvPosixTest,
+      public ::testing::WithParamInterface<std::pair<Env*, bool>> {
+ public:
+  EnvPosixTestWithParam() {
+    std::pair<Env*, bool> param_pair = GetParam();
+    env_ = param_pair.first;
+    direct_io_ = param_pair.second;
+  }
+
+  // Spin (sleeping kDelayMicros per iteration) until both the LOW and HIGH
+  // queues are empty.
+  void WaitThreadPoolsEmpty() {
+    // Wait until the thread pools are empty.
+    while (env_->GetThreadPoolQueueLen(Env::Priority::LOW) != 0) {
+      Env::Default()->SleepForMicroseconds(kDelayMicros);
+    }
+    while (env_->GetThreadPoolQueueLen(Env::Priority::HIGH) != 0) {
+      Env::Default()->SleepForMicroseconds(kDelayMicros);
+    }
+  }
+
+  ~EnvPosixTestWithParam() override { WaitThreadPoolsEmpty(); }
+};
+
+// Background-task callback: set the pointed-to atomic flag to true.
+static void SetBool(void* ptr) {
+  reinterpret_cast<std::atomic<bool>*>(ptr)->store(true);
+}
+
+// Schedule one job per priority pool and verify it runs within kDelayMicros.
+// DISABLED: timing-based, so inherently flaky.
+TEST_F(EnvPosixTest, DISABLED_RunImmediately) {
+  for (int pri = Env::BOTTOM; pri < Env::TOTAL; ++pri) {
+    std::atomic<bool> called(false);
+    env_->SetBackgroundThreads(1, static_cast<Env::Priority>(pri));
+    env_->Schedule(&SetBool, &called, static_cast<Env::Priority>(pri));
+    Env::Default()->SleepForMicroseconds(kDelayMicros);
+    ASSERT_TRUE(called.load());
+  }
+}
+
+// StartThread + WaitForJoin must run the callback to completion.
+TEST_F(EnvPosixTest, RunEventually) {
+  std::atomic<bool> called(false);
+  env_->StartThread(&SetBool, &called);
+  env_->WaitForJoin();
+  ASSERT_TRUE(called.load());
+}
+
+#ifdef OS_WIN
+// AreFilesSame should report true for a file and a hard link to it
+// (Windows-only build of this test).
+TEST_F(EnvPosixTest, AreFilesSame) {
+  {
+    bool tmp;
+    if (env_->AreFilesSame("", "", &tmp).IsNotSupported()) {
+      fprintf(stderr,
+              "skipping EnvBasicTestWithParam.AreFilesSame due to "
+              "unsupported Env::AreFilesSame\n");
+      return;
+    }
+  }
+
+  const EnvOptions soptions;
+  auto* env = Env::Default();
+  std::string same_file_name = test::PerThreadDBPath(env, "same_file");
+  std::string same_file_link_name = same_file_name + "_link";
+
+  std::unique_ptr<WritableFile> same_file;
+  ASSERT_OK(env->NewWritableFile(same_file_name, &same_file, soptions));
+  // NOTE(review): Append()'s Status is ignored here, unlike the surrounding
+  // calls -- consider wrapping in ASSERT_OK.
+  same_file->Append("random_data");
+  ASSERT_OK(same_file->Flush());
+  same_file.reset();
+
+  ASSERT_OK(env->LinkFile(same_file_name, same_file_link_name));
+  bool result = false;
+  ASSERT_OK(env->AreFilesSame(same_file_name, same_file_link_name, &result));
+  ASSERT_TRUE(result);
+}
+#endif
+
+#ifdef OS_LINUX
+// New files default to mode 0644; after SetAllowNonOwnerAccess(false) they
+// are created 0600. Linux-only; DISABLED (depends on process umask).
+TEST_F(EnvPosixTest, DISABLED_FilePermission) {
+  // Only works for Linux environment
+  if (env_ == Env::Default()) {
+    EnvOptions soptions;
+    std::vector<std::string> fileNames{
+        test::PerThreadDBPath(env_, "testfile"),
+        test::PerThreadDBPath(env_, "testfile1")};
+    std::unique_ptr<WritableFile> wfile;
+    ASSERT_OK(env_->NewWritableFile(fileNames[0], &wfile, soptions));
+    ASSERT_OK(env_->NewWritableFile(fileNames[1], &wfile, soptions));
+    wfile.reset();
+    std::unique_ptr<RandomRWFile> rwfile;
+    ASSERT_OK(env_->NewRandomRWFile(fileNames[1], &rwfile, soptions));
+
+    struct stat sb;
+    for (const auto& filename : fileNames) {
+      if (::stat(filename.c_str(), &sb) == 0) {
+        ASSERT_EQ(sb.st_mode & 0777, 0644);
+      }
+      ASSERT_OK(env_->DeleteFile(filename));
+    }
+
+    // NOTE(review): returned Status is ignored; consider ASSERT_OK.
+    env_->SetAllowNonOwnerAccess(false);
+    ASSERT_OK(env_->NewWritableFile(fileNames[0], &wfile, soptions));
+    ASSERT_OK(env_->NewWritableFile(fileNames[1], &wfile, soptions));
+    wfile.reset();
+    ASSERT_OK(env_->NewRandomRWFile(fileNames[1], &rwfile, soptions));
+
+    for (const auto& filename : fileNames) {
+      if (::stat(filename.c_str(), &sb) == 0) {
+        ASSERT_EQ(sb.st_mode & 0777, 0600);
+      }
+      ASSERT_OK(env_->DeleteFile(filename));
+    }
+  }
+}
+
+// LowerThreadPoolCPUPriority must only lower, never raise, a pool's CPU
+// priority, and must apply per pool. Sync points capture the before/after
+// priorities observed by the background thread.
+TEST_F(EnvPosixTest, LowerThreadPoolCpuPriority) {
+  std::atomic<CpuPriority> from_priority(CpuPriority::kNormal);
+  std::atomic<CpuPriority> to_priority(CpuPriority::kNormal);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "ThreadPoolImpl::BGThread::BeforeSetCpuPriority", [&](void* pri) {
+        from_priority.store(*reinterpret_cast<CpuPriority*>(pri));
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "ThreadPoolImpl::BGThread::AfterSetCpuPriority", [&](void* pri) {
+        to_priority.store(*reinterpret_cast<CpuPriority*>(pri));
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  env_->SetBackgroundThreads(1, Env::BOTTOM);
+  env_->SetBackgroundThreads(1, Env::HIGH);
+
+  // Schedule a job on `pool` and poll (up to ~kDelayMicros) until it ran.
+  auto RunTask = [&](Env::Priority pool) {
+    std::atomic<bool> called(false);
+    env_->Schedule(&SetBool, &called, pool);
+    for (int i = 0; i < kDelayMicros; i++) {
+      if (called.load()) {
+        break;
+      }
+      Env::Default()->SleepForMicroseconds(1);
+    }
+    ASSERT_TRUE(called.load());
+  };
+
+  {
+    // Same priority, no-op.
+    env_->LowerThreadPoolCPUPriority(Env::Priority::BOTTOM,
+                                     CpuPriority::kNormal)
+        .PermitUncheckedError();
+    RunTask(Env::Priority::BOTTOM);
+    ASSERT_EQ(from_priority, CpuPriority::kNormal);
+    ASSERT_EQ(to_priority, CpuPriority::kNormal);
+  }
+
+  {
+    // Higher priority, no-op.
+    env_->LowerThreadPoolCPUPriority(Env::Priority::BOTTOM, CpuPriority::kHigh)
+        .PermitUncheckedError();
+    RunTask(Env::Priority::BOTTOM);
+    ASSERT_EQ(from_priority, CpuPriority::kNormal);
+    ASSERT_EQ(to_priority, CpuPriority::kNormal);
+  }
+
+  {
+    // Lower priority from kNormal -> kLow.
+    env_->LowerThreadPoolCPUPriority(Env::Priority::BOTTOM, CpuPriority::kLow)
+        .PermitUncheckedError();
+    RunTask(Env::Priority::BOTTOM);
+    ASSERT_EQ(from_priority, CpuPriority::kNormal);
+    ASSERT_EQ(to_priority, CpuPriority::kLow);
+  }
+
+  {
+    // Lower priority from kLow -> kIdle.
+    env_->LowerThreadPoolCPUPriority(Env::Priority::BOTTOM, CpuPriority::kIdle)
+        .PermitUncheckedError();
+    RunTask(Env::Priority::BOTTOM);
+    ASSERT_EQ(from_priority, CpuPriority::kLow);
+    ASSERT_EQ(to_priority, CpuPriority::kIdle);
+  }
+
+  {
+    // Lower priority from kNormal -> kIdle for another pool.
+    env_->LowerThreadPoolCPUPriority(Env::Priority::HIGH, CpuPriority::kIdle)
+        .PermitUncheckedError();
+    RunTask(Env::Priority::HIGH);
+    ASSERT_EQ(from_priority, CpuPriority::kNormal);
+    ASSERT_EQ(to_priority, CpuPriority::kIdle);
+  }
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+#endif
+
+// Write 32 KB of random data, mmap it back via NewMemoryMappedFileBuffer,
+// and verify base/length/content. NotSupported is tolerated only off-Linux.
+TEST_F(EnvPosixTest, MemoryMappedFileBuffer) {
+  const int kFileBytes = 1 << 15;  // 32 KB
+  std::string expected_data;
+  std::string fname = test::PerThreadDBPath(env_, "testfile");
+  {
+    std::unique_ptr<WritableFile> wfile;
+    const EnvOptions soptions;
+    ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions));
+
+    Random rnd(301);
+    expected_data = rnd.RandomString(kFileBytes);
+    ASSERT_OK(wfile->Append(expected_data));
+  }
+
+  std::unique_ptr<MemoryMappedFileBuffer> mmap_buffer;
+  Status status = env_->NewMemoryMappedFileBuffer(fname, &mmap_buffer);
+  // it should be supported at least on linux
+#if !defined(OS_LINUX)
+  if (status.IsNotSupported()) {
+    fprintf(stderr,
+            "skipping EnvPosixTest.MemoryMappedFileBuffer due to "
+            "unsupported Env::NewMemoryMappedFileBuffer\n");
+    return;
+  }
+#endif  // !defined(OS_LINUX)
+
+  ASSERT_OK(status);
+  ASSERT_NE(nullptr, mmap_buffer.get());
+  ASSERT_NE(nullptr, mmap_buffer->GetBase());
+  ASSERT_EQ(kFileBytes, mmap_buffer->GetLen());
+  std::string actual_data(reinterpret_cast<const char*>(mmap_buffer->GetBase()),
+                          mmap_buffer->GetLen());
+  ASSERT_EQ(expected_data, actual_data);
+}
+
+#ifndef ROCKSDB_NO_DYNAMIC_EXTENSION
+// Exercise dynamic library loading: a bogus name must fail; if a rocksdb
+// shared library is present, known C symbols must resolve and unknown
+// symbols must not.
+TEST_F(EnvPosixTest, LoadRocksDBLibrary) {
+  std::shared_ptr<DynamicLibrary> library;
+  std::function<void*(void*, const char*)> function;
+  Status status = env_->LoadLibrary("no-such-library", "", &library);
+  ASSERT_NOK(status);
+  ASSERT_EQ(nullptr, library.get());
+  status = env_->LoadLibrary("rocksdb", "", &library);
+  if (status.ok()) {  // If we have can find a rocksdb shared library
+    ASSERT_NE(nullptr, library.get());
+    ASSERT_OK(library->LoadFunction("rocksdb_create_default_env",
+                                    &function));  // from C definition
+    ASSERT_NE(nullptr, function);
+    ASSERT_NOK(library->LoadFunction("no-such-method", &function));
+    ASSERT_EQ(nullptr, function);
+    ASSERT_OK(env_->LoadLibrary(library->Name(), "", &library));
+  } else {
+    ASSERT_EQ(nullptr, library.get());
+  }
+}
+#endif // !ROCKSDB_NO_DYNAMIC_EXTENSION
+
+#if !defined(OS_WIN) && !defined(ROCKSDB_NO_DYNAMIC_EXTENSION)
+// Same as LoadRocksDBLibrary but with explicit colon-separated search paths,
+// including the current working directory.
+TEST_F(EnvPosixTest, LoadRocksDBLibraryWithSearchPath) {
+  std::shared_ptr<DynamicLibrary> library;
+  std::function<void*(void*, const char*)> function;
+  ASSERT_NOK(env_->LoadLibrary("no-such-library", "/tmp", &library));
+  ASSERT_EQ(nullptr, library.get());
+  ASSERT_NOK(env_->LoadLibrary("dl", "/tmp", &library));
+  ASSERT_EQ(nullptr, library.get());
+  Status status = env_->LoadLibrary("rocksdb", "/tmp:./", &library);
+  if (status.ok()) {
+    ASSERT_NE(nullptr, library.get());
+    ASSERT_OK(env_->LoadLibrary(library->Name(), "", &library));
+  }
+  char buff[1024];
+  std::string cwd = getcwd(buff, sizeof(buff));
+
+  status = env_->LoadLibrary("rocksdb", "/tmp:" + cwd, &library);
+  if (status.ok()) {
+    ASSERT_NE(nullptr, library.get());
+    ASSERT_OK(env_->LoadLibrary(library->Name(), "", &library));
+  }
+}
+#endif // !OS_WIN && !ROCKSDB_NO_DYNAMIC_EXTENSION
+
+// UnSchedule must remove a queued job only when the tag matches, and the
+// pool must keep working afterwards.
+TEST_P(EnvPosixTestWithParam, UnSchedule) {
+  std::atomic<bool> called(false);
+  env_->SetBackgroundThreads(1, Env::LOW);
+
+  /* Block the low priority queue */
+  test::SleepingBackgroundTask sleeping_task, sleeping_task1;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task,
+                 Env::Priority::LOW);
+
+  /* Schedule another task */
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task1,
+                 Env::Priority::LOW, &sleeping_task1);
+
+  /* Remove it with a different tag */
+  ASSERT_EQ(0, env_->UnSchedule(&called, Env::Priority::LOW));
+
+  /* Remove it from the queue with the right tag */
+  ASSERT_EQ(1, env_->UnSchedule(&sleeping_task1, Env::Priority::LOW));
+
+  // Unblock background thread
+  sleeping_task.WakeUp();
+
+  /* Schedule another task */
+  env_->Schedule(&SetBool, &called);
+  for (int i = 0; i < kDelayMicros; i++) {
+    if (called.load()) {
+      break;
+    }
+    Env::Default()->SleepForMicroseconds(1);
+  }
+  ASSERT_TRUE(called.load());
+
+  ASSERT_TRUE(!sleeping_task.IsSleeping() && !sleeping_task1.IsSleeping());
+  WaitThreadPoolsEmpty();
+}
+
+// This test assumes that the last scheduled
+// task will run last. In fact, in the allotted
+// sleeping time nothing may actually run or they may
+// run in any order. The purpose of the test is unclear.
+#ifndef OS_WIN
+// With a single LOW thread, four scheduled callbacks must run in FIFO order;
+// each callback checks that its predecessor ran immediately before it.
+TEST_P(EnvPosixTestWithParam, RunMany) {
+  env_->SetBackgroundThreads(1, Env::LOW);
+  std::atomic<int> last_id(0);
+
+  struct CB {
+    std::atomic<int>* last_id_ptr;  // Pointer to shared slot
+    int id;                         // Order# for the execution of this callback
+
+    CB(std::atomic<int>* p, int i) : last_id_ptr(p), id(i) {}
+
+    static void Run(void* v) {
+      CB* cb = reinterpret_cast<CB*>(v);
+      int cur = cb->last_id_ptr->load();
+      ASSERT_EQ(cb->id - 1, cur);
+      cb->last_id_ptr->store(cb->id);
+    }
+  };
+
+  // Schedule in different order than start time
+  CB cb1(&last_id, 1);
+  CB cb2(&last_id, 2);
+  CB cb3(&last_id, 3);
+  CB cb4(&last_id, 4);
+  env_->Schedule(&CB::Run, &cb1);
+  env_->Schedule(&CB::Run, &cb2);
+  env_->Schedule(&CB::Run, &cb3);
+  env_->Schedule(&CB::Run, &cb4);
+  // The thread pool pops a task and then runs it, so the queue can be empty
+  // while the last task is still running. Schedule a dummy task at the end
+  // to make sure the last real callback has finished before the queue-empty
+  // check below passes.
+  struct DummyCB {
+    static void Run(void*) {}
+  };
+  env_->Schedule(&DummyCB::Run, nullptr);
+
+  WaitThreadPoolsEmpty();
+  ASSERT_EQ(4, last_id.load(std::memory_order_acquire));
+}
+#endif
+
+// Shared state for the StartThread test: `val` counts completed threads,
+// `num_running` counts threads still to finish; both guarded by `mu`.
+struct State {
+  port::Mutex mu;
+  int val;
+  int num_running;
+};
+
+// Thread body: atomically bump the completion count and decrement the
+// number of running threads.
+static void ThreadBody(void* arg) {
+  State* s = reinterpret_cast<State*>(arg);
+  s->mu.Lock();
+  s->val += 1;
+  s->num_running -= 1;
+  s->mu.Unlock();
+}
+
+// Start three threads and poll until all have run; each increments val once.
+TEST_P(EnvPosixTestWithParam, StartThread) {
+  State state;
+  state.val = 0;
+  state.num_running = 3;
+  for (int i = 0; i < 3; i++) {
+    env_->StartThread(&ThreadBody, &state);
+  }
+  while (true) {
+    state.mu.Lock();
+    int num = state.num_running;
+    state.mu.Unlock();
+    if (num == 0) {
+      break;
+    }
+    Env::Default()->SleepForMicroseconds(kDelayMicros);
+  }
+  ASSERT_EQ(state.val, 3);
+  WaitThreadPoolsEmpty();
+}
+
+// LOW and HIGH pools must be independent: with kJobs jobs blocked in each,
+// the queue lengths reflect each pool's own thread count, before and after
+// resizing the pools with IncBackgroundThreadsIfNeeded. Jobs are held on a
+// condition variable until the test releases them.
+TEST_P(EnvPosixTestWithParam, TwoPools) {
+  // Data structures to signal tasks to run.
+  port::Mutex mutex;
+  port::CondVar cv(&mutex);
+  bool should_start = false;
+
+  class CB {
+   public:
+    CB(const std::string& pool_name, int pool_size, port::Mutex* trigger_mu,
+       port::CondVar* trigger_cv, bool* _should_start)
+        : mu_(),
+          num_running_(0),
+          num_finished_(0),
+          pool_size_(pool_size),
+          pool_name_(pool_name),
+          trigger_mu_(trigger_mu),
+          trigger_cv_(trigger_cv),
+          should_start_(_should_start) {}
+
+    static void Run(void* v) {
+      CB* cb = reinterpret_cast<CB*>(v);
+      cb->Run();
+    }
+
+    void Run() {
+      {
+        MutexLock l(&mu_);
+        num_running_++;
+        // make sure we don't have more than pool_size_ jobs running.
+        ASSERT_LE(num_running_, pool_size_.load());
+      }
+
+      {
+        MutexLock l(trigger_mu_);
+        while (!(*should_start_)) {
+          trigger_cv_->Wait();
+        }
+      }
+
+      {
+        MutexLock l(&mu_);
+        num_running_--;
+        num_finished_++;
+      }
+    }
+
+    int NumFinished() {
+      MutexLock l(&mu_);
+      return num_finished_;
+    }
+
+    void Reset(int pool_size) {
+      pool_size_.store(pool_size);
+      num_finished_ = 0;
+    }
+
+   private:
+    port::Mutex mu_;
+    int num_running_;
+    int num_finished_;
+    std::atomic<int> pool_size_;
+    std::string pool_name_;
+    port::Mutex* trigger_mu_;
+    port::CondVar* trigger_cv_;
+    bool* should_start_;
+  };
+
+  const int kLowPoolSize = 2;
+  const int kHighPoolSize = 4;
+  const int kJobs = 8;
+
+  CB low_pool_job("low", kLowPoolSize, &mutex, &cv, &should_start);
+  CB high_pool_job("high", kHighPoolSize, &mutex, &cv, &should_start);
+
+  env_->SetBackgroundThreads(kLowPoolSize);
+  env_->SetBackgroundThreads(kHighPoolSize, Env::Priority::HIGH);
+
+  ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::LOW));
+  ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+
+  // schedule same number of jobs in each pool
+  for (int i = 0; i < kJobs; i++) {
+    env_->Schedule(&CB::Run, &low_pool_job);
+    env_->Schedule(&CB::Run, &high_pool_job, Env::Priority::HIGH);
+  }
+  // Wait a short while for the jobs to be dispatched.
+  int sleep_count = 0;
+  while ((unsigned int)(kJobs - kLowPoolSize) !=
+             env_->GetThreadPoolQueueLen(Env::Priority::LOW) ||
+         (unsigned int)(kJobs - kHighPoolSize) !=
+             env_->GetThreadPoolQueueLen(Env::Priority::HIGH)) {
+    env_->SleepForMicroseconds(kDelayMicros);
+    if (++sleep_count > 100) {
+      break;
+    }
+  }
+
+  ASSERT_EQ((unsigned int)(kJobs - kLowPoolSize),
+            env_->GetThreadPoolQueueLen());
+  ASSERT_EQ((unsigned int)(kJobs - kLowPoolSize),
+            env_->GetThreadPoolQueueLen(Env::Priority::LOW));
+  ASSERT_EQ((unsigned int)(kJobs - kHighPoolSize),
+            env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+
+  // Trigger jobs to run.
+  {
+    MutexLock l(&mutex);
+    should_start = true;
+    cv.SignalAll();
+  }
+
+  // wait for all jobs to finish
+  while (low_pool_job.NumFinished() < kJobs ||
+         high_pool_job.NumFinished() < kJobs) {
+    env_->SleepForMicroseconds(kDelayMicros);
+  }
+
+  ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::LOW));
+  ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+
+  // Hold jobs to schedule;
+  should_start = false;
+
+  // call IncBackgroundThreadsIfNeeded to two pools. One increasing and
+  // the other decreasing
+  env_->IncBackgroundThreadsIfNeeded(kLowPoolSize - 1, Env::Priority::LOW);
+  env_->IncBackgroundThreadsIfNeeded(kHighPoolSize + 1, Env::Priority::HIGH);
+  high_pool_job.Reset(kHighPoolSize + 1);
+  low_pool_job.Reset(kLowPoolSize);
+
+  // schedule same number of jobs in each pool
+  for (int i = 0; i < kJobs; i++) {
+    env_->Schedule(&CB::Run, &low_pool_job);
+    env_->Schedule(&CB::Run, &high_pool_job, Env::Priority::HIGH);
+  }
+  // Wait a short while for the jobs to be dispatched.
+  sleep_count = 0;
+  while ((unsigned int)(kJobs - kLowPoolSize) !=
+             env_->GetThreadPoolQueueLen(Env::Priority::LOW) ||
+         (unsigned int)(kJobs - (kHighPoolSize + 1)) !=
+             env_->GetThreadPoolQueueLen(Env::Priority::HIGH)) {
+    env_->SleepForMicroseconds(kDelayMicros);
+    if (++sleep_count > 100) {
+      break;
+    }
+  }
+  ASSERT_EQ((unsigned int)(kJobs - kLowPoolSize),
+            env_->GetThreadPoolQueueLen());
+  ASSERT_EQ((unsigned int)(kJobs - kLowPoolSize),
+            env_->GetThreadPoolQueueLen(Env::Priority::LOW));
+  ASSERT_EQ((unsigned int)(kJobs - (kHighPoolSize + 1)),
+            env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+
+  // Trigger jobs to run.
+  {
+    MutexLock l(&mutex);
+    should_start = true;
+    cv.SignalAll();
+  }
+
+  // wait for all jobs to finish
+  while (low_pool_job.NumFinished() < kJobs ||
+         high_pool_job.NumFinished() < kJobs) {
+    env_->SleepForMicroseconds(kDelayMicros);
+  }
+
+  env_->SetBackgroundThreads(kHighPoolSize, Env::Priority::HIGH);
+  WaitThreadPoolsEmpty();
+}
+
+// Growing and shrinking the HIGH pool must never kill running tasks:
+// shrinking only prevents idle/new threads from picking up queued work,
+// while tasks already sleeping keep their threads until woken. The step
+// comments track which tasks are expected to be running vs. queued.
+TEST_P(EnvPosixTestWithParam, DecreaseNumBgThreads) {
+  constexpr int kWaitMicros = 60000000;  // 1min
+
+  std::vector<test::SleepingBackgroundTask> tasks(10);
+
+  // Set number of thread to 1 first.
+  env_->SetBackgroundThreads(1, Env::Priority::HIGH);
+
+  // Schedule 3 tasks. 0 running; Task 1, 2 waiting.
+  for (size_t i = 0; i < 3; i++) {
+    env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &tasks[i],
+                   Env::Priority::HIGH);
+  }
+  ASSERT_FALSE(tasks[0].TimedWaitUntilSleeping(kWaitMicros));
+  ASSERT_EQ(2U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+  ASSERT_TRUE(tasks[0].IsSleeping());
+  ASSERT_TRUE(!tasks[1].IsSleeping());
+  ASSERT_TRUE(!tasks[2].IsSleeping());
+
+  // Increase to 2 threads. Task 0, 1 running; 2 waiting
+  env_->SetBackgroundThreads(2, Env::Priority::HIGH);
+  ASSERT_FALSE(tasks[1].TimedWaitUntilSleeping(kWaitMicros));
+  ASSERT_EQ(1U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+  ASSERT_TRUE(tasks[0].IsSleeping());
+  ASSERT_TRUE(tasks[1].IsSleeping());
+  ASSERT_TRUE(!tasks[2].IsSleeping());
+
+  // Shrink back to 1 thread. Still task 0, 1 running, 2 waiting
+  env_->SetBackgroundThreads(1, Env::Priority::HIGH);
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_EQ(1U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+  ASSERT_TRUE(tasks[0].IsSleeping());
+  ASSERT_TRUE(tasks[1].IsSleeping());
+  ASSERT_TRUE(!tasks[2].IsSleeping());
+
+  // The last task finishes. Task 0 running, 2 waiting.
+  tasks[1].WakeUp();
+  ASSERT_FALSE(tasks[1].TimedWaitUntilDone(kWaitMicros));
+  ASSERT_EQ(1U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+  ASSERT_TRUE(tasks[0].IsSleeping());
+  ASSERT_TRUE(!tasks[1].IsSleeping());
+  ASSERT_TRUE(!tasks[2].IsSleeping());
+
+  // Increase to 5 threads. Task 0 and 2 running.
+  env_->SetBackgroundThreads(5, Env::Priority::HIGH);
+  ASSERT_FALSE(tasks[2].TimedWaitUntilSleeping(kWaitMicros));
+  ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+  ASSERT_TRUE(tasks[0].IsSleeping());
+  ASSERT_TRUE(!tasks[1].IsSleeping());
+  ASSERT_TRUE(tasks[2].IsSleeping());
+
+  // Change number of threads a couple of times while there is no sufficient
+  // tasks.
+  env_->SetBackgroundThreads(7, Env::Priority::HIGH);
+  tasks[2].WakeUp();
+  ASSERT_FALSE(tasks[2].TimedWaitUntilDone(kWaitMicros));
+  ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+  env_->SetBackgroundThreads(3, Env::Priority::HIGH);
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+  env_->SetBackgroundThreads(4, Env::Priority::HIGH);
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+  env_->SetBackgroundThreads(5, Env::Priority::HIGH);
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+  env_->SetBackgroundThreads(4, Env::Priority::HIGH);
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+
+  Env::Default()->SleepForMicroseconds(kDelayMicros * 50);
+
+  // Enqueue 5 more tasks. Thread pool size now is 4.
+  // Task 0, 3, 4, 5 running;6, 7 waiting.
+  for (size_t i = 3; i < 8; i++) {
+    env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &tasks[i],
+                   Env::Priority::HIGH);
+  }
+  for (size_t i = 3; i <= 5; i++) {
+    ASSERT_FALSE(tasks[i].TimedWaitUntilSleeping(kWaitMicros));
+  }
+  ASSERT_EQ(2U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+  ASSERT_TRUE(tasks[0].IsSleeping());
+  ASSERT_TRUE(!tasks[1].IsSleeping());
+  ASSERT_TRUE(!tasks[2].IsSleeping());
+  ASSERT_TRUE(tasks[3].IsSleeping());
+  ASSERT_TRUE(tasks[4].IsSleeping());
+  ASSERT_TRUE(tasks[5].IsSleeping());
+  ASSERT_TRUE(!tasks[6].IsSleeping());
+  ASSERT_TRUE(!tasks[7].IsSleeping());
+
+  // Wake up task 0, 3 and 4. Task 5, 6, 7 running.
+  tasks[0].WakeUp();
+  tasks[3].WakeUp();
+  tasks[4].WakeUp();
+
+  for (size_t i = 5; i < 8; i++) {
+    ASSERT_FALSE(tasks[i].TimedWaitUntilSleeping(kWaitMicros));
+  }
+  ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+  for (size_t i = 5; i < 8; i++) {
+    ASSERT_TRUE(tasks[i].IsSleeping());
+  }
+
+  // Shrink back to 1 thread. Still task 5, 6, 7 running
+  env_->SetBackgroundThreads(1, Env::Priority::HIGH);
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_TRUE(tasks[5].IsSleeping());
+  ASSERT_TRUE(tasks[6].IsSleeping());
+  ASSERT_TRUE(tasks[7].IsSleeping());
+
+  // Wake up task 6. Task 5, 7 running
+  tasks[6].WakeUp();
+  ASSERT_FALSE(tasks[6].TimedWaitUntilDone(kWaitMicros));
+  ASSERT_TRUE(tasks[5].IsSleeping());
+  ASSERT_TRUE(!tasks[6].IsSleeping());
+  ASSERT_TRUE(tasks[7].IsSleeping());
+
+  // Wake up threads 7. Task 5 running
+  tasks[7].WakeUp();
+  ASSERT_FALSE(tasks[7].TimedWaitUntilDone(kWaitMicros));
+  ASSERT_TRUE(!tasks[7].IsSleeping());
+
+  // Enqueue thread 8 and 9. Task 5 running; one of 8, 9 might be running.
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &tasks[8],
+                 Env::Priority::HIGH);
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &tasks[9],
+                 Env::Priority::HIGH);
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_GT(env_->GetThreadPoolQueueLen(Env::Priority::HIGH), (unsigned int)0);
+  ASSERT_TRUE(!tasks[8].IsSleeping() || !tasks[9].IsSleeping());
+
+  // Increase to 4 threads. Task 5, 8, 9 running.
+  env_->SetBackgroundThreads(4, Env::Priority::HIGH);
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_EQ((unsigned int)0, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+  ASSERT_TRUE(tasks[8].IsSleeping());
+  ASSERT_TRUE(tasks[9].IsSleeping());
+
+  // Shrink to 1 thread
+  env_->SetBackgroundThreads(1, Env::Priority::HIGH);
+
+  // Wake up thread 9.
+  tasks[9].WakeUp();
+  ASSERT_FALSE(tasks[9].TimedWaitUntilDone(kWaitMicros));
+  ASSERT_TRUE(!tasks[9].IsSleeping());
+  ASSERT_TRUE(tasks[8].IsSleeping());
+
+  // Wake up thread 8
+  tasks[8].WakeUp();
+  ASSERT_FALSE(tasks[8].TimedWaitUntilDone(kWaitMicros));
+  ASSERT_TRUE(!tasks[8].IsSleeping());
+
+  // Wake up the last thread
+  tasks[5].WakeUp();
+  ASSERT_FALSE(tasks[5].TimedWaitUntilDone(kWaitMicros));
+  WaitThreadPoolsEmpty();
+}
+
// Verifies ThreadPoolImpl thread reservation on the HIGH-priority pool:
// ReserveThreads() keeps the reserved threads from picking up scheduled
// tasks, ReleaseThreads() hands them back, and the number of reservable
// threads tracks pool resizing. Sync points pin down thread start and
// termination ordering so the assertions are deterministic.
TEST_P(EnvPosixTestWithParam, ReserveThreads) {
  // Initialize the background thread to 1 in case other threads exist
  // from the last unit test
  env_->SetBackgroundThreads(1, Env::Priority::HIGH);
  ASSERT_EQ(env_->GetBackgroundThreads(Env::HIGH), 1);
  constexpr int kWaitMicros = 10000000;  // 10 seconds
  std::vector<test::SleepingBackgroundTask> tasks(4);
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
  // Set the sync point to ensure thread 0 can terminate
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
      {{"ThreadPoolImpl::BGThread::Termination:th0",
        "EnvTest::ReserveThreads:0"}});
  // Empty the thread pool to ensure all the threads can start later
  env_->SetBackgroundThreads(0, Env::Priority::HIGH);
  TEST_SYNC_POINT("EnvTest::ReserveThreads:0");
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
  // Set the sync point to ensure threads start and pass the sync point
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
      {{"ThreadPoolImpl::BGThread::Start:th0", "EnvTest::ReserveThreads:1"},
       {"ThreadPoolImpl::BGThread::Start:th1", "EnvTest::ReserveThreads:2"},
       {"ThreadPoolImpl::BGThread::Start:th2", "EnvTest::ReserveThreads:3"},
       {"ThreadPoolImpl::BGThread::Start:th3", "EnvTest::ReserveThreads:4"}});
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  // Set number of thread to 3 first.
  env_->SetBackgroundThreads(3, Env::Priority::HIGH);
  ASSERT_EQ(env_->GetBackgroundThreads(Env::HIGH), 3);
  // Add sync points to ensure all 3 threads start
  TEST_SYNC_POINT("EnvTest::ReserveThreads:1");
  TEST_SYNC_POINT("EnvTest::ReserveThreads:2");
  TEST_SYNC_POINT("EnvTest::ReserveThreads:3");
  // Reserve 2 threads
  ASSERT_EQ(2, env_->ReserveThreads(2, Env::Priority::HIGH));

  // Schedule 3 tasks. Task 0 running (in this context, doing
  // SleepingBackgroundTask); Task 1, 2 waiting; 3 reserved threads.
  for (size_t i = 0; i < 3; i++) {
    env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &tasks[i],
                   Env::Priority::HIGH);
  }
  ASSERT_FALSE(tasks[0].TimedWaitUntilSleeping(kWaitMicros));
  ASSERT_EQ(2U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
  ASSERT_TRUE(tasks[0].IsSleeping());
  ASSERT_TRUE(!tasks[1].IsSleeping());
  ASSERT_TRUE(!tasks[2].IsSleeping());

  // Release 2 threads. Task 0, 1, 2 running; 0 reserved thread.
  ASSERT_EQ(2, env_->ReleaseThreads(2, Env::Priority::HIGH));
  ASSERT_FALSE(tasks[1].TimedWaitUntilSleeping(kWaitMicros));
  ASSERT_FALSE(tasks[2].TimedWaitUntilSleeping(kWaitMicros));
  ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
  ASSERT_TRUE(tasks[1].IsSleeping());
  ASSERT_TRUE(tasks[2].IsSleeping());
  // No more threads can be reserved
  ASSERT_EQ(0, env_->ReserveThreads(3, Env::Priority::HIGH));
  // Expand the number of background threads so that the last thread
  // is waiting
  env_->SetBackgroundThreads(4, Env::Priority::HIGH);
  // Add sync point to ensure the 4th thread starts
  TEST_SYNC_POINT("EnvTest::ReserveThreads:4");
  // As the thread pool is expanded, we can reserve one more thread
  ASSERT_EQ(1, env_->ReserveThreads(3, Env::Priority::HIGH));
  // No more threads can be reserved
  ASSERT_EQ(0, env_->ReserveThreads(3, Env::Priority::HIGH));

  // Reset the sync points for the next iteration in BGThread or the
  // next time Submit() is called
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
      {{"ThreadPoolImpl::BGThread::WaitingThreadsInc",
        "EnvTest::ReserveThreads:5"},
       {"ThreadPoolImpl::BGThread::Termination", "EnvTest::ReserveThreads:6"},
       {"ThreadPoolImpl::Submit::Enqueue", "EnvTest::ReserveThreads:7"}});

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
  tasks[0].WakeUp();
  ASSERT_FALSE(tasks[0].TimedWaitUntilDone(kWaitMicros));
  // Add sync point to ensure the number of waiting threads increases
  TEST_SYNC_POINT("EnvTest::ReserveThreads:5");
  // 1 more thread can be reserved
  ASSERT_EQ(1, env_->ReserveThreads(3, Env::Priority::HIGH));
  // 2 reserved threads now

  // Currently, two threads are blocked since the number of waiting
  // threads is equal to the number of reserved threads (i.e., 2).
  // If we reduce the number of background thread to 1, at least one thread
  // will be the last excessive thread (here we have no control over the
  // number of excessive threads because thread order does not
  // necessarily follows the schedule order, but we ensure that the last thread
  // shall not run any task by expanding the thread pool after we schedule
  // the tasks), and thus they(it) become(s) unblocked, the number of waiting
  // threads decreases to 0 or 1, but the number of reserved threads is still 2
  env_->SetBackgroundThreads(1, Env::Priority::HIGH);

  // Task 1,2 running; 2 reserved threads, however, in fact, we only have
  // 0 or 1 waiting thread in the thread pool, proved by the
  // following test, we CANNOT reserve 2 threads even though we just
  // release 2
  TEST_SYNC_POINT("EnvTest::ReserveThreads:6");
  ASSERT_EQ(2, env_->ReleaseThreads(2, Env::Priority::HIGH));
  ASSERT_GT(2, env_->ReserveThreads(2, Env::Priority::HIGH));

  // Every new task will be put into the queue at this point
  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &tasks[3],
                 Env::Priority::HIGH);
  TEST_SYNC_POINT("EnvTest::ReserveThreads:7");
  ASSERT_EQ(1U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
  ASSERT_TRUE(!tasks[3].IsSleeping());

  // Set the number of threads to 3 so that Task 3 can dequeue
  env_->SetBackgroundThreads(3, Env::Priority::HIGH);
  // Wake up Task 1
  tasks[1].WakeUp();
  ASSERT_FALSE(tasks[1].TimedWaitUntilDone(kWaitMicros));
  // Task 2, 3 running (Task 3 dequeue); 0 or 1 reserved thread
  ASSERT_FALSE(tasks[3].TimedWaitUntilSleeping(kWaitMicros));
  ASSERT_TRUE(tasks[3].IsSleeping());
  ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));

  // At most 1 thread can be released
  ASSERT_GT(2, env_->ReleaseThreads(3, Env::Priority::HIGH));
  tasks[2].WakeUp();
  ASSERT_FALSE(tasks[2].TimedWaitUntilDone(kWaitMicros));
  tasks[3].WakeUp();
  ASSERT_FALSE(tasks[3].TimedWaitUntilDone(kWaitMicros));
  WaitThreadPoolsEmpty();
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
}
+
+#if (defined OS_LINUX || defined OS_WIN)
+namespace {
+bool IsSingleVarint(const std::string& s) {
+ Slice slice(s);
+
+ uint64_t v;
+ if (!GetVarint64(&slice, &v)) {
+ return false;
+ }
+
+ return slice.size() == 0;
+}
+
+bool IsUniqueIDValid(const std::string& s) {
+ return !s.empty() && !IsSingleVarint(s);
+}
+
// Scratch buffer shared by the GetUniqueId() tests below; sized generously
// for any ID the implementation returns.
const size_t MAX_ID_SIZE = 100;
char temp_id[MAX_ID_SIZE];
+
+} // namespace
+
+// Determine whether we can use the FS_IOC_GETVERSION ioctl
+// on a file in directory DIR. Create a temporary file therein,
+// try to apply the ioctl (save that result), cleanup and
+// return the result. Return true if it is supported, and
+// false if anything fails.
+// Note that this function "knows" that dir has just been created
+// and is empty, so we create a simply-named test file: "f".
+bool ioctl_support__FS_IOC_GETVERSION(const std::string& dir) {
+#ifdef OS_WIN
+ return true;
+#else
+ const std::string file = dir + "/f";
+ int fd;
+ do {
+ fd = open(file.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644);
+ } while (fd < 0 && errno == EINTR);
+ long int version;
+ bool ok = (fd >= 0 && ioctl(fd, FS_IOC_GETVERSION, &version) >= 0);
+
+ close(fd);
+ unlink(file.c_str());
+
+ return ok;
+#endif
+}
+
+// To ensure that Env::GetUniqueId-related tests work correctly, the files
+// should be stored in regular storage like "hard disk" or "flash device",
+// and not on a tmpfs file system (like /dev/shm and /tmp on some systems).
+// Otherwise we cannot get the correct id.
+//
+// This function serves as the replacement for test::TmpDir(), which may be
+// customized to be on a file system that doesn't work with GetUniqueId().
+
class IoctlFriendlyTmpdir {
 public:
  // Creates a temporary directory on a filesystem where FS_IOC_GETVERSION
  // works, trying $TEST_IOCTL_FRIENDLY_TMPDIR first (if set and short
  // enough), then /var/tmp and /tmp. On Windows any directory is accepted.
  // If no suitable directory is found but we appear to be inside a Docker
  // container, is_supported() is set to false instead of aborting.
  explicit IoctlFriendlyTmpdir() {
    char dir_buf[100];

    const char* fmt = "%s/rocksdb.XXXXXX";
    const char* tmp = getenv("TEST_IOCTL_FRIENDLY_TMPDIR");

#ifdef OS_WIN
#define rmdir _rmdir
    if (tmp == nullptr) {
      tmp = getenv("TMP");
    }

    snprintf(dir_buf, sizeof dir_buf, fmt, tmp);
    auto result = _mktemp(dir_buf);
    assert(result != nullptr);
    BOOL ret = CreateDirectory(dir_buf, NULL);
    assert(ret == TRUE);
    dir_ = dir_buf;
#else
    std::list<std::string> candidate_dir_list = {"/var/tmp", "/tmp"};

    // If $TEST_IOCTL_FRIENDLY_TMPDIR/rocksdb.XXXXXX fits, use
    // $TEST_IOCTL_FRIENDLY_TMPDIR; subtract 2 for the "%s", and
    // add 1 for the trailing NUL byte.
    if (tmp && strlen(tmp) + strlen(fmt) - 2 + 1 <= sizeof dir_buf) {
      // use $TEST_IOCTL_FRIENDLY_TMPDIR value
      candidate_dir_list.push_front(tmp);
    }

    // Probe each candidate: create a temp dir, test the ioctl inside it,
    // and keep the first directory that works.
    for (const std::string& d : candidate_dir_list) {
      snprintf(dir_buf, sizeof dir_buf, fmt, d.c_str());
      if (mkdtemp(dir_buf)) {
        if (ioctl_support__FS_IOC_GETVERSION(dir_buf)) {
          dir_ = dir_buf;
          return;
        } else {
          // Diagnose ioctl-related failure only if this is the
          // directory specified via that envvar.
          if (tmp && tmp == d) {
            fprintf(stderr,
                    "TEST_IOCTL_FRIENDLY_TMPDIR-specified directory is "
                    "not suitable: %s\n",
                    d.c_str());
          }
          rmdir(dir_buf);  // ignore failure
        }
      } else {
        // mkdtemp failed: diagnose it, but don't give up.
        fprintf(stderr, "mkdtemp(%s/...) failed: %s\n", d.c_str(),
                errnoStr(errno).c_str());
      }
    }

    // check if it's running test within a docker container, in which case, the
    // file system inside `overlayfs` may not support FS_IOC_GETVERSION
    // skip the tests
    struct stat buffer;
    if (stat("/.dockerenv", &buffer) == 0) {
      is_supported_ = false;
      return;
    }

    fprintf(stderr,
            "failed to find an ioctl-friendly temporary directory;"
            " specify one via the TEST_IOCTL_FRIENDLY_TMPDIR envvar\n");
    std::abort();
#endif
  }

  // Removes the temporary directory (assumed empty again by now).
  ~IoctlFriendlyTmpdir() { rmdir(dir_.c_str()); }

  // Path of the created directory; empty if is_supported() is false.
  const std::string& name() const { return dir_; }

  // False only when no ioctl-friendly directory could be found inside a
  // Docker container; callers should bypass the test in that case.
  bool is_supported() const { return is_supported_; }

 private:
  // Absolute path of the temporary directory created by the constructor.
  std::string dir_;

  bool is_supported_ = true;
};
+
+#ifndef ROCKSDB_LITE
// Writes a page of 'a' at offset 0 and a page of 'b' at offset kBlockSize
// (4096) via PositionedAppend with direct writes mocked, then reads the file
// back sequentially and checks the 'a'/'b' boundary sits exactly at
// kBlockSize.
TEST_F(EnvPosixTest, PositionedAppend) {
  std::unique_ptr<WritableFile> writable_file;
  EnvOptions options;
  options.use_direct_writes = true;
  options.use_mmap_writes = false;
  std::string fname = test::PerThreadDBPath(env_, "positioned_append");
  SetupSyncPointsToMockDirectIO();

  ASSERT_OK(env_->NewWritableFile(fname, &writable_file, options));
  const size_t kBlockSize = 4096;
  const size_t kDataSize = kPageSize;
  // Write a page worth of 'a'
  auto data_ptr = NewAligned(kDataSize, 'a');
  Slice data_a(data_ptr.get(), kDataSize);
  ASSERT_OK(writable_file->PositionedAppend(data_a, 0U));
  // Write a page worth of 'b' right after the first sector
  data_ptr = NewAligned(kDataSize, 'b');
  Slice data_b(data_ptr.get(), kDataSize);
  ASSERT_OK(writable_file->PositionedAppend(data_b, kBlockSize));
  ASSERT_OK(writable_file->Close());
  // The file now has 1 sector worth of a followed by a page worth of b

  // Verify the above
  std::unique_ptr<SequentialFile> seq_file;
  ASSERT_OK(env_->NewSequentialFile(fname, &seq_file, options));
  size_t scratch_len = kPageSize * 2;
  std::unique_ptr<char[]> scratch(new char[scratch_len]);
  Slice result;
  ASSERT_OK(seq_file->Read(scratch_len, &result, scratch.get()));
  // Total length is one block of 'a' plus a full page of 'b'.
  ASSERT_EQ(kPageSize + kBlockSize, result.size());
  ASSERT_EQ('a', result[kBlockSize - 1]);
  ASSERT_EQ('b', result[kBlockSize]);
}
+#endif // !ROCKSDB_LITE
+
+// `GetUniqueId()` temporarily returns zero on Windows. `BlockBasedTable` can
+// handle a return value of zero but this test case cannot.
+#ifndef OS_WIN
// Checks that GetUniqueId() on a RandomAccessFile returns a stable, valid ID:
// three openings of the same file (including one after a 1s delay) must all
// yield the identical non-trivial ID. Runs only against Env::Default() and
// only on an ioctl-friendly filesystem.
TEST_P(EnvPosixTestWithParam, RandomAccessUniqueID) {
  // Create file.
  if (env_ == Env::Default()) {
    EnvOptions soptions;
    soptions.use_direct_reads = soptions.use_direct_writes = direct_io_;
    IoctlFriendlyTmpdir ift;
    if (!ift.is_supported()) {
      ROCKSDB_GTEST_BYPASS(
          "FS_IOC_GETVERSION is not supported by the filesystem");
      return;
    }
    std::string fname = ift.name() + "/testfile";
    std::unique_ptr<WritableFile> wfile;
    ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions));

    std::unique_ptr<RandomAccessFile> file;

    // Get Unique ID
    ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions));
    size_t id_size = file->GetUniqueId(temp_id, MAX_ID_SIZE);
    ASSERT_TRUE(id_size > 0);
    std::string unique_id1(temp_id, id_size);
    ASSERT_TRUE(IsUniqueIDValid(unique_id1));

    // Get Unique ID again
    ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions));
    id_size = file->GetUniqueId(temp_id, MAX_ID_SIZE);
    ASSERT_TRUE(id_size > 0);
    std::string unique_id2(temp_id, id_size);
    ASSERT_TRUE(IsUniqueIDValid(unique_id2));

    // Get Unique ID again after waiting some time.
    env_->SleepForMicroseconds(1000000);
    ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions));
    id_size = file->GetUniqueId(temp_id, MAX_ID_SIZE);
    ASSERT_TRUE(id_size > 0);
    std::string unique_id3(temp_id, id_size);
    ASSERT_TRUE(IsUniqueIDValid(unique_id3));

    // Check IDs are the same.
    ASSERT_EQ(unique_id1, unique_id2);
    ASSERT_EQ(unique_id2, unique_id3);

    // Delete the file
    ASSERT_OK(env_->DeleteFile(fname));
  }
}
+#endif // !defined(OS_WIN)
+
+// only works in linux platforms
+#ifdef ROCKSDB_FALLOCATE_PRESENT
// Verifies preallocation behavior: after SetPreallocationBlockSize +
// PrepareWrite, the on-disk block count should reflect the 100 MB
// preallocation while the logical size stays at the 1 MB actually written;
// closing the file should deallocate the unused blocks. Skipped when the
// filesystem does not support fallocate().
TEST_P(EnvPosixTestWithParam, AllocateTest) {
  if (env_ == Env::Default()) {
    SetupSyncPointsToMockDirectIO();
    std::string fname = test::PerThreadDBPath(env_, "preallocate_testfile");
    // Try fallocate in a file to see whether the target file system supports
    // it.
    // Skip the test if fallocate is not supported.
    std::string fname_test_fallocate =
        test::PerThreadDBPath(env_, "preallocate_testfile_2");
    int fd = -1;
    do {
      // Retry open() on EINTR.
      fd = open(fname_test_fallocate.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644);
    } while (fd < 0 && errno == EINTR);
    ASSERT_GT(fd, 0);

    int alloc_status = fallocate(fd, 0, 0, 1);

    int err_number = 0;
    if (alloc_status != 0) {
      err_number = errno;
      fprintf(stderr, "Warning: fallocate() fails, %s\n",
              errnoStr(err_number).c_str());
    }
    close(fd);
    ASSERT_OK(env_->DeleteFile(fname_test_fallocate));
    if (alloc_status != 0 && err_number == EOPNOTSUPP) {
      // The filesystem containing the file does not support fallocate
      return;
    }

    EnvOptions soptions;
    soptions.use_mmap_writes = false;
    soptions.use_direct_reads = soptions.use_direct_writes = direct_io_;
    std::unique_ptr<WritableFile> wfile;
    ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions));

    // allocate 100 MB
    size_t kPreallocateSize = 100 * 1024 * 1024;
    size_t kBlockSize = 512;
    size_t kDataSize = 1024 * 1024;
    auto data_ptr = NewAligned(kDataSize, 'A');
    Slice data(data_ptr.get(), kDataSize);
    wfile->SetPreallocationBlockSize(kPreallocateSize);
    wfile->PrepareWrite(wfile->GetFileSize(), kDataSize);
    ASSERT_OK(wfile->Append(data));
    ASSERT_OK(wfile->Flush());

    struct stat f_stat;
    ASSERT_EQ(stat(fname.c_str(), &f_stat), 0);
    // Logical size reflects only the bytes actually appended.
    ASSERT_EQ((unsigned int)kDataSize, f_stat.st_size);
    // verify that blocks are preallocated
    // Note here that we don't check the exact number of blocks preallocated --
    // we only require that number of allocated blocks is at least what we
    // expect.
    // It looks like some FS give us more blocks that we asked for. That's fine.
    // It might be worth investigating further.
    ASSERT_LE((unsigned int)(kPreallocateSize / kBlockSize), f_stat.st_blocks);

    // close the file, should deallocate the blocks
    wfile.reset();

    stat(fname.c_str(), &f_stat);
    ASSERT_EQ((unsigned int)kDataSize, f_stat.st_size);
    // verify that preallocated blocks were deallocated on file close
    // Because the FS might give us more blocks, we add a full page to the size
    // and expect the number of blocks to be less or equal to that.
    ASSERT_GE((f_stat.st_size + kPageSize + kBlockSize - 1) / kBlockSize,
              (unsigned int)f_stat.st_blocks);
  }
}
+#endif // ROCKSDB_FALLOCATE_PRESENT
+
+// Returns true if any of the strings in ss are the prefix of another string.
// Returns true if any of the strings in ss are the prefix of another string.
// An empty string in the set counts as a prefix of everything.
bool HasPrefix(const std::unordered_set<std::string>& ss) {
  for (const auto& str : ss) {
    if (str.empty()) {
      return true;
    }
    std::string prefix;
    prefix.reserve(str.size());
    // Test every proper prefix of `str` for membership in the set.
    for (size_t len = 1; len < str.size(); ++len) {
      prefix.assign(str, 0, len);
      if (ss.find(prefix) != ss.end()) {
        return true;
      }
    }
  }
  return false;
}
+
+// `GetUniqueId()` temporarily returns zero on Windows. `BlockBasedTable` can
+// handle a return value of zero but this test case cannot.
+#ifndef OS_WIN
// Creates 1000 files that exist simultaneously and verifies their unique IDs
// are pairwise distinct and prefix-free. Runs only against Env::Default() and
// only on an ioctl-friendly filesystem.
TEST_P(EnvPosixTestWithParam, RandomAccessUniqueIDConcurrent) {
  if (env_ == Env::Default()) {
    // Check whether a bunch of concurrently existing files have unique IDs.
    EnvOptions soptions;
    soptions.use_direct_reads = soptions.use_direct_writes = direct_io_;

    // Create the files
    IoctlFriendlyTmpdir ift;
    if (!ift.is_supported()) {
      ROCKSDB_GTEST_BYPASS(
          "FS_IOC_GETVERSION is not supported by the filesystem");
      return;
    }
    std::vector<std::string> fnames;
    for (int i = 0; i < 1000; ++i) {
      fnames.push_back(ift.name() + "/" + "testfile" + std::to_string(i));

      // Create file.
      std::unique_ptr<WritableFile> wfile;
      ASSERT_OK(env_->NewWritableFile(fnames[i], &wfile, soptions));
    }

    // Collect and check whether the IDs are unique.
    std::unordered_set<std::string> ids;
    for (const std::string& fname : fnames) {
      std::unique_ptr<RandomAccessFile> file;
      std::string unique_id;
      ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions));
      size_t id_size = file->GetUniqueId(temp_id, MAX_ID_SIZE);
      ASSERT_TRUE(id_size > 0);
      unique_id = std::string(temp_id, id_size);
      ASSERT_TRUE(IsUniqueIDValid(unique_id));

      // Any duplicate ID is a failure.
      ASSERT_TRUE(ids.count(unique_id) == 0);
      ids.insert(unique_id);
    }

    // Delete the files
    for (const std::string& fname : fnames) {
      ASSERT_OK(env_->DeleteFile(fname));
    }

    // IDs must also be prefix-free so they remain unambiguous when embedded.
    ASSERT_TRUE(!HasPrefix(ids));
  }
}
+
+// TODO: Disable the flaky test, it's a known issue that ext4 may return same
+// key after file deletion. The issue is tracked in #7405, #7470.
// TODO: Disable the flaky test, it's a known issue that ext4 may return same
// key after file deletion. The issue is tracked in #7405, #7470.
// Repeatedly creates, identifies, and deletes the same path, asserting that
// unique IDs are never reused across delete/recreate cycles.
TEST_P(EnvPosixTestWithParam, DISABLED_RandomAccessUniqueIDDeletes) {
  if (env_ == Env::Default()) {
    EnvOptions soptions;
    soptions.use_direct_reads = soptions.use_direct_writes = direct_io_;

    IoctlFriendlyTmpdir ift;
    if (!ift.is_supported()) {
      ROCKSDB_GTEST_BYPASS(
          "FS_IOC_GETVERSION is not supported by the filesystem");
      return;
    }
    std::string fname = ift.name() + "/" + "testfile";

    // Check that after file is deleted we don't get same ID again in a new
    // file.
    std::unordered_set<std::string> ids;
    for (int i = 0; i < 1000; ++i) {
      // Create file.
      {
        std::unique_ptr<WritableFile> wfile;
        ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions));
      }

      // Get Unique ID
      std::string unique_id;
      {
        std::unique_ptr<RandomAccessFile> file;
        ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions));
        size_t id_size = file->GetUniqueId(temp_id, MAX_ID_SIZE);
        ASSERT_TRUE(id_size > 0);
        unique_id = std::string(temp_id, id_size);
      }

      ASSERT_TRUE(IsUniqueIDValid(unique_id));
      ASSERT_TRUE(ids.count(unique_id) == 0);
      ids.insert(unique_id);

      // Delete the file
      ASSERT_OK(env_->DeleteFile(fname));
    }

    // Collected IDs must also be prefix-free.
    ASSERT_TRUE(!HasPrefix(ids));
  }
}
+#endif // !defined(OS_WIN)
+
// Writes 8 sectors with distinct fill bytes, then issues 3-request MultiRead
// batches across 20 attempts. A sync-point callback randomly shrinks the
// io_uring result size (after the first attempt) to simulate partial reads;
// results must still come back complete and correct.
TEST_P(EnvPosixTestWithParam, MultiRead) {
  EnvOptions soptions;
  soptions.use_direct_reads = soptions.use_direct_writes = direct_io_;
  std::string fname = test::PerThreadDBPath(env_, "testfile");

  const size_t kSectorSize = 4096;
  const size_t kNumSectors = 8;

  // Create file.
  {
    std::unique_ptr<WritableFile> wfile;
#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && \
    !defined(OS_AIX)
    if (soptions.use_direct_writes) {
      soptions.use_direct_writes = false;
    }
#endif
    ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions));
    // Sector i is filled with byte value i+1.
    for (size_t i = 0; i < kNumSectors; ++i) {
      auto data = NewAligned(kSectorSize * 8, static_cast<char>(i + 1));
      Slice slice(data.get(), kSectorSize);
      ASSERT_OK(wfile->Append(slice));
    }
    ASSERT_OK(wfile->Close());
  }

  // More attempts to simulate more partial result sequences.
  for (uint32_t attempt = 0; attempt < 20; attempt++) {
    // Random Read
    Random rnd(301 + attempt);
    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
        "UpdateResults::io_uring_result", [&](void* arg) {
          if (attempt > 0) {
            // No failure in the first attempt.
            size_t& bytes_read = *static_cast<size_t*>(arg);
            if (rnd.OneIn(4)) {
              bytes_read = 0;
            } else if (rnd.OneIn(3)) {
              bytes_read = static_cast<size_t>(
                  rnd.Uniform(static_cast<int>(bytes_read)));
            }
          }
        });

    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
    std::unique_ptr<RandomAccessFile> file;
    std::vector<ReadRequest> reqs(3);
    std::vector<std::unique_ptr<char, Deleter>> data;
    uint64_t offset = 0;
    // Requests target every other sector: offsets 0, 2, 4 (in sectors).
    for (size_t i = 0; i < reqs.size(); ++i) {
      reqs[i].offset = offset;
      offset += 2 * kSectorSize;
      reqs[i].len = kSectorSize;
      data.emplace_back(NewAligned(kSectorSize, 0));
      reqs[i].scratch = data.back().get();
    }
#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && \
    !defined(OS_AIX)
    if (soptions.use_direct_reads) {
      soptions.use_direct_reads = false;
    }
#endif
    ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions));
    ASSERT_OK(file->MultiRead(reqs.data(), reqs.size()));
    for (size_t i = 0; i < reqs.size(); ++i) {
      // Expected fill byte for sector 2*i is (2*i)+1.
      auto buf = NewAligned(kSectorSize * 8, static_cast<char>(i * 2 + 1));
      ASSERT_OK(reqs[i].status);
      ASSERT_EQ(memcmp(reqs[i].scratch, buf.get(), kSectorSize), 0);
    }
    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
  }
}
+
// Stress-tests MultiRead with up to 512 non-aligned, non-overlapping requests
// per batch (enough to force multiple io_uring submission rounds), while a
// sync-point callback randomly truncates results to exercise the
// partial-read retry path. Each result is compared byte-for-byte against the
// expected slice of the file.
TEST_F(EnvPosixTest, MultiReadNonAlignedLargeNum) {
  // In this test we don't do aligned read, so it doesn't work for
  // direct I/O case.
  EnvOptions soptions;
  soptions.use_direct_reads = soptions.use_direct_writes = false;
  std::string fname = test::PerThreadDBPath(env_, "testfile");

  const size_t kTotalSize = 81920;
  Random rnd(301);
  std::string expected_data = rnd.RandomString(kTotalSize);

  // Create file.
  {
    std::unique_ptr<WritableFile> wfile;
    ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions));
    ASSERT_OK(wfile->Append(expected_data));
    ASSERT_OK(wfile->Close());
  }

  // More attempts to simulate more partial result sequences.
  for (uint32_t attempt = 0; attempt < 25; attempt++) {
    // Right now kIoUringDepth is hard coded as 256, so we need very large
    // number of keys to cover the case of multiple rounds of submissions.
    // Right now the test latency is still acceptable. If it ends up with
    // too long, we can modify the io uring depth with SyncPoint here.
    const int num_reads = rnd.Uniform(512) + 1;

    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
        "UpdateResults::io_uring_result", [&](void* arg) {
          if (attempt > 5) {
            // Improve partial result rates in second half of the run to
            // cover the case of repeated partial results.
            int odd = (attempt < 15) ? num_reads / 2 : 4;
            // No failure in first several attempts.
            size_t& bytes_read = *static_cast<size_t*>(arg);
            if (rnd.OneIn(odd)) {
              bytes_read = 0;
            } else if (rnd.OneIn(odd / 2)) {
              bytes_read = static_cast<size_t>(
                  rnd.Uniform(static_cast<int>(bytes_read)));
            }
          }
        });
    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

    // Generate (offset, len) pairs
    std::set<int> start_offsets;
    for (int i = 0; i < num_reads; i++) {
      int rnd_off;
      // No repeat offsets.
      while (start_offsets.find(rnd_off = rnd.Uniform(81920)) !=
             start_offsets.end()) {
      }
      start_offsets.insert(rnd_off);
    }
    std::vector<size_t> offsets;
    std::vector<size_t> lens;
    // std::set already sorted the offsets.
    for (int so : start_offsets) {
      offsets.push_back(so);
    }
    // Each length is capped so requests never overlap the next offset
    // (or run past end-of-file for the last one).
    for (size_t i = 0; i + 1 < offsets.size(); i++) {
      lens.push_back(static_cast<size_t>(
          rnd.Uniform(static_cast<int>(offsets[i + 1] - offsets[i])) + 1));
    }
    lens.push_back(static_cast<size_t>(
        rnd.Uniform(static_cast<int>(kTotalSize - offsets.back())) + 1));
    ASSERT_EQ(num_reads, lens.size());

    // Create requests
    std::vector<std::string> scratches;
    scratches.reserve(num_reads);
    std::vector<ReadRequest> reqs(num_reads);
    for (size_t i = 0; i < reqs.size(); ++i) {
      reqs[i].offset = offsets[i];
      reqs[i].len = lens[i];
      scratches.emplace_back(reqs[i].len, ' ');
      reqs[i].scratch = const_cast<char*>(scratches.back().data());
    }

    // Query the data
    std::unique_ptr<RandomAccessFile> file;
    ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions));
    ASSERT_OK(file->MultiRead(reqs.data(), reqs.size()));

    // Validate results
    for (int i = 0; i < num_reads; ++i) {
      ASSERT_OK(reqs[i].status);
      ASSERT_EQ(
          Slice(expected_data.data() + offsets[i], lens[i]).ToString(true),
          reqs[i].result.ToString(true));
    }

    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
  }
}
+
+#ifndef ROCKSDB_LITE
// Writes a 4095-byte file (one byte short of a page) and issues two
// alignment-rounded MultiRead requests whose rounded-up extents reach past
// end-of-file; both requests must still return OK. On platforms where
// O_DIRECT is unsupported, a sync-point callback strips the flag.
TEST_F(EnvPosixTest, NonAlignedDirectIOMultiReadBeyondFileSize) {
  EnvOptions soptions;
  soptions.use_direct_reads = true;
  soptions.use_direct_writes = false;
  std::string fname = test::PerThreadDBPath(env_, "testfile");

  Random rnd(301);
  std::unique_ptr<WritableFile> wfile;
  size_t alignment = 0;
  // Create file.
  {
    ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions));
    auto data_ptr = NewAligned(4095, 'b');
    Slice data_b(data_ptr.get(), 4095);
    ASSERT_OK(wfile->PositionedAppend(data_b, 0U));
    ASSERT_OK(wfile->Close());
  }

#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && \
    !defined(OS_AIX) && !defined(OS_OPENBSD) && !defined(OS_FREEBSD)
  if (soptions.use_direct_reads) {
    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
        "NewRandomAccessFile:O_DIRECT", [&](void* arg) {
          // Remove O_DIRECT from the open flags.
          int* val = static_cast<int*>(arg);
          *val &= ~O_DIRECT;
        });
  }
#endif
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  const int num_reads = 2;
  // Create requests
  std::vector<std::string> scratches;
  scratches.reserve(num_reads);
  std::vector<ReadRequest> reqs(num_reads);

  std::unique_ptr<RandomAccessFile> file;
  ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions));
  alignment = file->GetRequiredBufferAlignment();
  ASSERT_EQ(num_reads, reqs.size());

  std::vector<std::unique_ptr<char, Deleter>> data;

  std::vector<size_t> offsets = {0, 2047};
  std::vector<size_t> lens = {2047, 4096 - 2047};

  for (size_t i = 0; i < num_reads; i++) {
    // Do alignment: round the offset down and the end up to the
    // required buffer alignment.
    reqs[i].offset = static_cast<uint64_t>(TruncateToPageBoundary(
        alignment, static_cast<size_t>(/*offset=*/offsets[i])));
    reqs[i].len =
        Roundup(static_cast<size_t>(/*offset=*/offsets[i]) + /*length=*/lens[i],
                alignment) -
        reqs[i].offset;

    size_t new_capacity = Roundup(reqs[i].len, alignment);
    data.emplace_back(NewAligned(new_capacity, 0));
    reqs[i].scratch = data.back().get();
  }

  // Query the data
  ASSERT_OK(file->MultiRead(reqs.data(), reqs.size()));

  // Validate results
  for (size_t i = 0; i < num_reads; ++i) {
    ASSERT_OK(reqs[i].status);
  }

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
}
+#endif // ROCKSDB_LITE
+
+#if defined(ROCKSDB_IOURING_PRESENT)
+void GenerateFilesAndRequest(Env* env, const std::string& fname,
+ std::vector<ReadRequest>* ret_reqs,
+ std::vector<std::string>* scratches) {
+ const size_t kTotalSize = 81920;
+ Random rnd(301);
+ std::string expected_data = rnd.RandomString(kTotalSize);
+
+ // Create file.
+ {
+ std::unique_ptr<WritableFile> wfile;
+ ASSERT_OK(env->NewWritableFile(fname, &wfile, EnvOptions()));
+ ASSERT_OK(wfile->Append(expected_data));
+ ASSERT_OK(wfile->Close());
+ }
+
+ // Right now kIoUringDepth is hard coded as 256, so we need very large
+ // number of keys to cover the case of multiple rounds of submissions.
+ // Right now the test latency is still acceptable. If it ends up with
+ // too long, we can modify the io uring depth with SyncPoint here.
+ const int num_reads = 3;
+ std::vector<size_t> offsets = {10000, 20000, 30000};
+ std::vector<size_t> lens = {3000, 200, 100};
+
+ // Create requests
+ scratches->reserve(num_reads);
+ std::vector<ReadRequest>& reqs = *ret_reqs;
+ reqs.resize(num_reads);
+ for (int i = 0; i < num_reads; ++i) {
+ reqs[i].offset = offsets[i];
+ reqs[i].len = lens[i];
+ scratches->emplace_back(reqs[i].len, ' ');
+ reqs[i].scratch = const_cast<char*>(scratches->back().data());
+ }
+}
+
// Forces io_uring_wait_cqe() to report a failure (return value 1) on its
// first call via a sync-point callback and checks that MultiRead surfaces a
// non-OK status. If io_uring is not actually used, the status is ignored.
TEST_F(EnvPosixTest, MultiReadIOUringError) {
  // In this test we don't do aligned read, so we can't do direct I/O.
  EnvOptions soptions;
  soptions.use_direct_reads = soptions.use_direct_writes = false;
  std::string fname = test::PerThreadDBPath(env_, "testfile");

  std::vector<std::string> scratches;
  std::vector<ReadRequest> reqs;
  GenerateFilesAndRequest(env_, fname, &reqs, &scratches);
  // Query the data
  std::unique_ptr<RandomAccessFile> file;
  ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions));

  bool io_uring_wait_cqe_called = false;
  SyncPoint::GetInstance()->SetCallBack(
      "PosixRandomAccessFile::MultiRead:io_uring_wait_cqe:return",
      [&](void* arg) {
        // Fail only the first wait so a single error path is exercised.
        if (!io_uring_wait_cqe_called) {
          io_uring_wait_cqe_called = true;
          ssize_t& ret = *(static_cast<ssize_t*>(arg));
          ret = 1;
        }
      });
  SyncPoint::GetInstance()->EnableProcessing();

  Status s = file->MultiRead(reqs.data(), reqs.size());
  if (io_uring_wait_cqe_called) {
    ASSERT_NOK(s);
  } else {
    // io_uring path not taken; nothing to assert.
    s.PermitUncheckedError();
  }

  SyncPoint::GetInstance()->DisableProcessing();
  SyncPoint::GetInstance()->ClearAllCallBacks();
}
+
+TEST_F(EnvPosixTest, MultiReadIOUringError2) {
+ // In this test we don't do aligned read, so we can't do direct I/O.
+ EnvOptions soptions;
+ soptions.use_direct_reads = soptions.use_direct_writes = false;
+ std::string fname = test::PerThreadDBPath(env_, "testfile");
+
+ std::vector<std::string> scratches;
+ std::vector<ReadRequest> reqs;
+ GenerateFilesAndRequest(env_, fname, &reqs, &scratches);
+ // Query the data
+ std::unique_ptr<RandomAccessFile> file;
+ ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions));
+
+ bool io_uring_submit_and_wait_called = false;
+ SyncPoint::GetInstance()->SetCallBack(
+ "PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return1",
+ [&](void* arg) {
+ io_uring_submit_and_wait_called = true;
+ ssize_t* ret = static_cast<ssize_t*>(arg);
+ (*ret)--;
+ });
+ SyncPoint::GetInstance()->SetCallBack(
+ "PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return2",
+ [&](void* arg) {
+ struct io_uring* iu = static_cast<struct io_uring*>(arg);
+ struct io_uring_cqe* cqe;
+ assert(io_uring_wait_cqe(iu, &cqe) == 0);
+ io_uring_cqe_seen(iu, cqe);
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Status s = file->MultiRead(reqs.data(), reqs.size());
+ if (io_uring_submit_and_wait_called) {
+ ASSERT_NOK(s);
+ } else {
+ s.PermitUncheckedError();
+ }
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+#endif // ROCKSDB_IOURING_PRESENT
+
+// Only works in linux platforms
// Writes one 512-byte sector and verifies InvalidateCache() returns OK on
// writable, random-access, and sequential files, both for a byte range and
// for the whole file (length 0), with data still reading back correctly.
// Disabled on Windows (name selected by the #ifdef below).
#ifdef OS_WIN
TEST_P(EnvPosixTestWithParam, DISABLED_InvalidateCache) {
#else
TEST_P(EnvPosixTestWithParam, InvalidateCache) {
#endif
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
  EnvOptions soptions;
  soptions.use_direct_reads = soptions.use_direct_writes = direct_io_;
  std::string fname = test::PerThreadDBPath(env_, "testfile");

  const size_t kSectorSize = 512;
  auto data = NewAligned(kSectorSize, 0);
  Slice slice(data.get(), kSectorSize);

  // Create file.
  {
    std::unique_ptr<WritableFile> wfile;
#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && \
    !defined(OS_AIX)
    if (soptions.use_direct_writes) {
      soptions.use_direct_writes = false;
    }
#endif
    ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions));
    ASSERT_OK(wfile->Append(slice));
    ASSERT_OK(wfile->InvalidateCache(0, 0));
    ASSERT_OK(wfile->Close());
  }

  // Random Read
  {
    std::unique_ptr<RandomAccessFile> file;
    auto scratch = NewAligned(kSectorSize, 0);
    Slice result;
#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && \
    !defined(OS_AIX)
    if (soptions.use_direct_reads) {
      soptions.use_direct_reads = false;
    }
#endif
    ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions));
    ASSERT_OK(file->Read(0, kSectorSize, &result, scratch.get()));
    ASSERT_EQ(memcmp(scratch.get(), data.get(), kSectorSize), 0);
    ASSERT_OK(file->InvalidateCache(0, 11));
    ASSERT_OK(file->InvalidateCache(0, 0));
  }

  // Sequential Read
  {
    std::unique_ptr<SequentialFile> file;
    auto scratch = NewAligned(kSectorSize, 0);
    Slice result;
#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && \
    !defined(OS_AIX)
    if (soptions.use_direct_reads) {
      soptions.use_direct_reads = false;
    }
#endif
    ASSERT_OK(env_->NewSequentialFile(fname, &file, soptions));
    if (file->use_direct_io()) {
      ASSERT_OK(file->PositionedRead(0, kSectorSize, &result, scratch.get()));
    } else {
      ASSERT_OK(file->Read(kSectorSize, &result, scratch.get()));
    }
    ASSERT_EQ(memcmp(scratch.get(), data.get(), kSectorSize), 0);
    ASSERT_OK(file->InvalidateCache(0, 11));
    ASSERT_OK(file->InvalidateCache(0, 0));
  }
  // Delete the file
  ASSERT_OK(env_->DeleteFile(fname));
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
}
+#endif // OS_LINUX || OS_WIN
+
+class TestLogger : public Logger {
+ public:
+ using Logger::Logv;
+ void Logv(const char* format, va_list ap) override {
+ log_count++;
+
+ char new_format[550];
+ std::fill_n(new_format, sizeof(new_format), '2');
+ {
+ va_list backup_ap;
+ va_copy(backup_ap, ap);
+ int n = vsnprintf(new_format, sizeof(new_format) - 1, format, backup_ap);
+ // 48 bytes for extra information + bytes allocated
+
+// When we have n == -1 there is not a terminating zero expected
+#ifdef OS_WIN
+ if (n < 0) {
+ char_0_count++;
+ }
+#endif
+
+ if (new_format[0] == '[') {
+ // "[DEBUG] "
+ ASSERT_TRUE(n <= 56 + (512 - static_cast<int>(sizeof(port::TimeVal))));
+ } else {
+ ASSERT_TRUE(n <= 48 + (512 - static_cast<int>(sizeof(port::TimeVal))));
+ }
+ va_end(backup_ap);
+ }
+
+ for (size_t i = 0; i < sizeof(new_format); i++) {
+ if (new_format[i] == 'x') {
+ char_x_count++;
+ } else if (new_format[i] == '\0') {
+ char_0_count++;
+ }
+ }
+ }
+ int log_count;
+ int char_x_count;
+ int char_0_count;
+};
+
+TEST_P(EnvPosixTestWithParam, LogBufferTest) {
+ TestLogger test_logger;
+ test_logger.SetInfoLogLevel(InfoLogLevel::INFO_LEVEL);
+ test_logger.log_count = 0;
+ test_logger.char_x_count = 0;
+ test_logger.char_0_count = 0;
+ LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, &test_logger);
+ LogBuffer log_buffer_debug(DEBUG_LEVEL, &test_logger);
+
+ char bytes200[200];
+ std::fill_n(bytes200, sizeof(bytes200), '1');
+ bytes200[sizeof(bytes200) - 1] = '\0';
+ char bytes600[600];
+ std::fill_n(bytes600, sizeof(bytes600), '1');
+ bytes600[sizeof(bytes600) - 1] = '\0';
+ char bytes9000[9000];
+ std::fill_n(bytes9000, sizeof(bytes9000), '1');
+ bytes9000[sizeof(bytes9000) - 1] = '\0';
+
+ ROCKS_LOG_BUFFER(&log_buffer, "x%sx", bytes200);
+ ROCKS_LOG_BUFFER(&log_buffer, "x%sx", bytes600);
+ ROCKS_LOG_BUFFER(&log_buffer, "x%sx%sx%sx", bytes200, bytes200, bytes200);
+ ROCKS_LOG_BUFFER(&log_buffer, "x%sx%sx", bytes200, bytes600);
+ ROCKS_LOG_BUFFER(&log_buffer, "x%sx%sx", bytes600, bytes9000);
+
+ ROCKS_LOG_BUFFER(&log_buffer_debug, "x%sx", bytes200);
+ test_logger.SetInfoLogLevel(DEBUG_LEVEL);
+ ROCKS_LOG_BUFFER(&log_buffer_debug, "x%sx%sx%sx", bytes600, bytes9000,
+ bytes200);
+
+ ASSERT_EQ(0, test_logger.log_count);
+ log_buffer.FlushBufferToLog();
+ log_buffer_debug.FlushBufferToLog();
+ ASSERT_EQ(6, test_logger.log_count);
+ ASSERT_EQ(6, test_logger.char_0_count);
+ ASSERT_EQ(10, test_logger.char_x_count);
+}
+
+class TestLogger2 : public Logger {
+ public:
+ explicit TestLogger2(size_t max_log_size) : max_log_size_(max_log_size) {}
+ using Logger::Logv;
+ void Logv(const char* format, va_list ap) override {
+ char new_format[2000];
+ std::fill_n(new_format, sizeof(new_format), '2');
+ {
+ va_list backup_ap;
+ va_copy(backup_ap, ap);
+ int n = vsnprintf(new_format, sizeof(new_format) - 1, format, backup_ap);
+ // 48 bytes for extra information + bytes allocated
+ ASSERT_TRUE(n <=
+ 48 + static_cast<int>(max_log_size_ - sizeof(port::TimeVal)));
+ ASSERT_TRUE(n > static_cast<int>(max_log_size_ - sizeof(port::TimeVal)));
+ va_end(backup_ap);
+ }
+ }
+ size_t max_log_size_;
+};
+
+TEST_P(EnvPosixTestWithParam, LogBufferMaxSizeTest) {
+ char bytes9000[9000];
+ std::fill_n(bytes9000, sizeof(bytes9000), '1');
+ bytes9000[sizeof(bytes9000) - 1] = '\0';
+
+ for (size_t max_log_size = 256; max_log_size <= 1024;
+ max_log_size += 1024 - 256) {
+ TestLogger2 test_logger(max_log_size);
+ test_logger.SetInfoLogLevel(InfoLogLevel::INFO_LEVEL);
+ LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, &test_logger);
+ ROCKS_LOG_BUFFER_MAX_SZ(&log_buffer, max_log_size, "%s", bytes9000);
+ log_buffer.FlushBufferToLog();
+ }
+}
+
// Checks WritableFile preallocation bookkeeping: no blocks are reserved
// before any write, and PrepareWrite() grows the reservation as the file
// extends past previously allocated blocks.
TEST_P(EnvPosixTestWithParam, Preallocation) {
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
  const std::string src = test::PerThreadDBPath(env_, "testfile");
  std::unique_ptr<WritableFile> srcfile;
  EnvOptions soptions;
  soptions.use_direct_reads = soptions.use_direct_writes = direct_io_;
#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && \
    !defined(OS_AIX) && !defined(OS_OPENBSD) && !defined(OS_FREEBSD)
  // In the direct-I/O parameterization, strip O_DIRECT at open time via a
  // sync point (presumably so this test's writes need not satisfy direct-I/O
  // alignment constraints — confirm).
  if (soptions.use_direct_writes) {
    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
        "NewWritableFile:O_DIRECT", [&](void* arg) {
          int* val = static_cast<int*>(arg);
          *val &= ~O_DIRECT;
        });
  }
#endif
  ASSERT_OK(env_->NewWritableFile(src, &srcfile, soptions));
  srcfile->SetPreallocationBlockSize(1024 * 1024);

  // No writes should mean no preallocation
  size_t block_size, last_allocated_block;
  srcfile->GetPreallocationStatus(&block_size, &last_allocated_block);
  ASSERT_EQ(last_allocated_block, 0UL);

  // Small write should preallocate one block
  size_t kStrSize = 4096;
  auto data = NewAligned(kStrSize, 'A');
  Slice str(data.get(), kStrSize);
  srcfile->PrepareWrite(srcfile->GetFileSize(), kStrSize);
  ASSERT_OK(srcfile->Append(str));
  srcfile->GetPreallocationStatus(&block_size, &last_allocated_block);
  ASSERT_EQ(last_allocated_block, 1UL);

  // Write an entire preallocation block, make sure we increased by two.
  {
    auto buf_ptr = NewAligned(block_size, ' ');
    Slice buf(buf_ptr.get(), block_size);
    srcfile->PrepareWrite(srcfile->GetFileSize(), block_size);
    ASSERT_OK(srcfile->Append(buf));
    srcfile->GetPreallocationStatus(&block_size, &last_allocated_block);
    ASSERT_EQ(last_allocated_block, 2UL);
  }

  // Write five more blocks at once, ensure we're where we need to be.
  {
    auto buf_ptr = NewAligned(block_size * 5, ' ');
    Slice buf = Slice(buf_ptr.get(), block_size * 5);
    srcfile->PrepareWrite(srcfile->GetFileSize(), buf.size());
    ASSERT_OK(srcfile->Append(buf));
    srcfile->GetPreallocationStatus(&block_size, &last_allocated_block);
    ASSERT_EQ(last_allocated_block, 7UL);
  }
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
}
+
// Test that the two ways to get children file attributes (in bulk or
// individually) behave consistently.
TEST_P(EnvPosixTestWithParam, ConsistentChildrenAttributes) {
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
  EnvOptions soptions;
  soptions.use_direct_reads = soptions.use_direct_writes = direct_io_;
  const int kNumChildren = 10;

  std::string data;
  std::string test_base_dir = test::PerThreadDBPath(env_, "env_test_chr_attr");
  env_->CreateDir(test_base_dir).PermitUncheckedError();
  // Create kNumChildren files of strictly increasing size: file i holds
  // 4096 * i bytes ('data' grows by 4096 after each file is written).
  for (int i = 0; i < kNumChildren; ++i) {
    const std::string path = test_base_dir + "/testfile_" + std::to_string(i);
    std::unique_ptr<WritableFile> file;
#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && \
    !defined(OS_AIX) && !defined(OS_OPENBSD) && !defined(OS_FREEBSD)
    // In direct-I/O runs, strip O_DIRECT at open time via a sync point so
    // the variable-sized writes below are performed buffered.
    if (soptions.use_direct_writes) {
      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
          "NewWritableFile:O_DIRECT", [&](void* arg) {
            int* val = static_cast<int*>(arg);
            *val &= ~O_DIRECT;
          });
    }
#endif
    ASSERT_OK(env_->NewWritableFile(path, &file, soptions));
    auto buf_ptr = NewAligned(data.size(), 'T');
    Slice buf(buf_ptr.get(), data.size());
    ASSERT_OK(file->Append(buf));
    data.append(std::string(4096, 'T'));
  }

  std::vector<Env::FileAttributes> file_attrs;
  ASSERT_OK(env_->GetChildrenFileAttributes(test_base_dir, &file_attrs));
  // The size reported by the bulk listing must match GetFileSize() and the
  // expected 4096 * i bytes for each child.
  for (int i = 0; i < kNumChildren; ++i) {
    const std::string name = "testfile_" + std::to_string(i);
    const std::string path = test_base_dir + "/" + name;

    auto file_attrs_iter = std::find_if(
        file_attrs.begin(), file_attrs.end(),
        [&name](const Env::FileAttributes& fm) { return fm.name == name; });
    ASSERT_TRUE(file_attrs_iter != file_attrs.end());
    uint64_t size;
    ASSERT_OK(env_->GetFileSize(path, &size));
    ASSERT_EQ(size, 4096 * i);
    ASSERT_EQ(size, file_attrs_iter->size_bytes);
  }
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
}
+
// Test that WritableFileWrapper forwards all calls to WritableFile.
TEST_P(EnvPosixTestWithParam, WritableFileWrapper) {
  // Base records every call by bumping a shared counter and asserting it was
  // reached in the expected order; the final EXPECT_EQ(24, step) proves that
  // each method (plus constructor and destructor) ran exactly once via the
  // wrapper.
  class Base : public WritableFile {
   public:
    mutable int* step_;

    // Asserts this call is the x-th event, then advances the counter.
    void inc(int x) const { EXPECT_EQ(x, (*step_)++); }

    explicit Base(int* step) : step_(step) { inc(0); }

    Status Append(const Slice& /*data*/) override {
      inc(1);
      return Status::OK();
    }

    // Both Append overloads share ordinal 1: the wrapper is only expected to
    // invoke one of them per Append call.
    Status Append(
        const Slice& /*data*/,
        const DataVerificationInfo& /* verification_info */) override {
      inc(1);
      return Status::OK();
    }

    Status PositionedAppend(const Slice& /*data*/,
                            uint64_t /*offset*/) override {
      inc(2);
      return Status::OK();
    }

    Status PositionedAppend(
        const Slice& /*data*/, uint64_t /*offset*/,
        const DataVerificationInfo& /* verification_info */) override {
      inc(2);
      return Status::OK();
    }

    Status Truncate(uint64_t /*size*/) override {
      inc(3);
      return Status::OK();
    }

    Status Close() override {
      inc(4);
      return Status::OK();
    }

    Status Flush() override {
      inc(5);
      return Status::OK();
    }

    Status Sync() override {
      inc(6);
      return Status::OK();
    }

    Status Fsync() override {
      inc(7);
      return Status::OK();
    }

    bool IsSyncThreadSafe() const override {
      inc(8);
      return true;
    }

    bool use_direct_io() const override {
      inc(9);
      return true;
    }

    size_t GetRequiredBufferAlignment() const override {
      inc(10);
      return 0;
    }

    void SetIOPriority(Env::IOPriority /*pri*/) override { inc(11); }

    Env::IOPriority GetIOPriority() override {
      inc(12);
      return Env::IOPriority::IO_LOW;
    }

    void SetWriteLifeTimeHint(Env::WriteLifeTimeHint /*hint*/) override {
      inc(13);
    }

    Env::WriteLifeTimeHint GetWriteLifeTimeHint() override {
      inc(14);
      return Env::WriteLifeTimeHint::WLTH_NOT_SET;
    }

    uint64_t GetFileSize() override {
      inc(15);
      return 0;
    }

    void SetPreallocationBlockSize(size_t /*size*/) override { inc(16); }

    void GetPreallocationStatus(size_t* /*block_size*/,
                                size_t* /*last_allocated_block*/) override {
      inc(17);
    }

    size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const override {
      inc(18);
      return 0;
    }

    Status InvalidateCache(size_t /*offset*/, size_t /*length*/) override {
      inc(19);
      return Status::OK();
    }

    Status RangeSync(uint64_t /*offset*/, uint64_t /*nbytes*/) override {
      inc(20);
      return Status::OK();
    }

    void PrepareWrite(size_t /*offset*/, size_t /*len*/) override { inc(21); }

    Status Allocate(uint64_t /*offset*/, uint64_t /*len*/) override {
      inc(22);
      return Status::OK();
    }

   public:
    // Destructor is the final (23rd) event.
    ~Base() override { inc(23); }
  };

  // Plain wrapper with no overrides: every call must fall through to Base.
  class Wrapper : public WritableFileWrapper {
   public:
    explicit Wrapper(WritableFile* target) : WritableFileWrapper(target) {}
  };

  int step = 0;

  // Invoke every WritableFile method once, in the order Base expects.
  {
    Base b(&step);
    Wrapper w(&b);
    ASSERT_OK(w.Append(Slice()));
    ASSERT_OK(w.PositionedAppend(Slice(), 0));
    ASSERT_OK(w.Truncate(0));
    ASSERT_OK(w.Close());
    ASSERT_OK(w.Flush());
    ASSERT_OK(w.Sync());
    ASSERT_OK(w.Fsync());
    w.IsSyncThreadSafe();
    w.use_direct_io();
    w.GetRequiredBufferAlignment();
    w.SetIOPriority(Env::IOPriority::IO_HIGH);
    w.GetIOPriority();
    w.SetWriteLifeTimeHint(Env::WriteLifeTimeHint::WLTH_NOT_SET);
    w.GetWriteLifeTimeHint();
    w.GetFileSize();
    w.SetPreallocationBlockSize(0);
    w.GetPreallocationStatus(nullptr, nullptr);
    w.GetUniqueId(nullptr, 0);
    ASSERT_OK(w.InvalidateCache(0, 0));
    ASSERT_OK(w.RangeSync(0, 0));
    w.PrepareWrite(0, 0);
    ASSERT_OK(w.Allocate(0, 0));
  }

  EXPECT_EQ(24, step);
}
+
+TEST_P(EnvPosixTestWithParam, PosixRandomRWFile) {
+ const std::string path = test::PerThreadDBPath(env_, "random_rw_file");
+
+ env_->DeleteFile(path).PermitUncheckedError();
+
+ std::unique_ptr<RandomRWFile> file;
+
+ // Cannot open non-existing file.
+ ASSERT_NOK(env_->NewRandomRWFile(path, &file, EnvOptions()));
+
+ // Create the file using WritableFile
+ {
+ std::unique_ptr<WritableFile> wf;
+ ASSERT_OK(env_->NewWritableFile(path, &wf, EnvOptions()));
+ }
+
+ ASSERT_OK(env_->NewRandomRWFile(path, &file, EnvOptions()));
+
+ char buf[10000];
+ Slice read_res;
+
+ ASSERT_OK(file->Write(0, "ABCD"));
+ ASSERT_OK(file->Read(0, 10, &read_res, buf));
+ ASSERT_EQ(read_res.ToString(), "ABCD");
+
+ ASSERT_OK(file->Write(2, "XXXX"));
+ ASSERT_OK(file->Read(0, 10, &read_res, buf));
+ ASSERT_EQ(read_res.ToString(), "ABXXXX");
+
+ ASSERT_OK(file->Write(10, "ZZZ"));
+ ASSERT_OK(file->Read(10, 10, &read_res, buf));
+ ASSERT_EQ(read_res.ToString(), "ZZZ");
+
+ ASSERT_OK(file->Write(11, "Y"));
+ ASSERT_OK(file->Read(10, 10, &read_res, buf));
+ ASSERT_EQ(read_res.ToString(), "ZYZ");
+
+ ASSERT_OK(file->Write(200, "FFFFF"));
+ ASSERT_OK(file->Read(200, 10, &read_res, buf));
+ ASSERT_EQ(read_res.ToString(), "FFFFF");
+
+ ASSERT_OK(file->Write(205, "XXXX"));
+ ASSERT_OK(file->Read(200, 10, &read_res, buf));
+ ASSERT_EQ(read_res.ToString(), "FFFFFXXXX");
+
+ ASSERT_OK(file->Write(5, "QQQQ"));
+ ASSERT_OK(file->Read(0, 9, &read_res, buf));
+ ASSERT_EQ(read_res.ToString(), "ABXXXQQQQ");
+
+ ASSERT_OK(file->Read(2, 4, &read_res, buf));
+ ASSERT_EQ(read_res.ToString(), "XXXQ");
+
+ // Close file and reopen it
+ ASSERT_OK(file->Close());
+ ASSERT_OK(env_->NewRandomRWFile(path, &file, EnvOptions()));
+
+ ASSERT_OK(file->Read(0, 9, &read_res, buf));
+ ASSERT_EQ(read_res.ToString(), "ABXXXQQQQ");
+
+ ASSERT_OK(file->Read(10, 3, &read_res, buf));
+ ASSERT_EQ(read_res.ToString(), "ZYZ");
+
+ ASSERT_OK(file->Read(200, 9, &read_res, buf));
+ ASSERT_EQ(read_res.ToString(), "FFFFFXXXX");
+
+ ASSERT_OK(file->Write(4, "TTTTTTTTTTTTTTTT"));
+ ASSERT_OK(file->Read(0, 10, &read_res, buf));
+ ASSERT_EQ(read_res.ToString(), "ABXXTTTTTT");
+
+ // Clean up
+ ASSERT_OK(env_->DeleteFile(path));
+}
+
+class RandomRWFileWithMirrorString {
+ public:
+ explicit RandomRWFileWithMirrorString(RandomRWFile* _file) : file_(_file) {}
+
+ void Write(size_t offset, const std::string& data) {
+ // Write to mirror string
+ StringWrite(offset, data);
+
+ // Write to file
+ Status s = file_->Write(offset, data);
+ ASSERT_OK(s) << s.ToString();
+ }
+
+ void Read(size_t offset = 0, size_t n = 1000000) {
+ Slice str_res(nullptr, 0);
+ if (offset < file_mirror_.size()) {
+ size_t str_res_sz = std::min(file_mirror_.size() - offset, n);
+ str_res = Slice(file_mirror_.data() + offset, str_res_sz);
+ StopSliceAtNull(&str_res);
+ }
+
+ Slice file_res;
+ Status s = file_->Read(offset, n, &file_res, buf_);
+ ASSERT_OK(s) << s.ToString();
+ StopSliceAtNull(&file_res);
+
+ ASSERT_EQ(str_res.ToString(), file_res.ToString()) << offset << " " << n;
+ }
+
+ void SetFile(RandomRWFile* _file) { file_ = _file; }
+
+ private:
+ void StringWrite(size_t offset, const std::string& src) {
+ if (offset + src.size() > file_mirror_.size()) {
+ file_mirror_.resize(offset + src.size(), '\0');
+ }
+
+ char* pos = const_cast<char*>(file_mirror_.data() + offset);
+ memcpy(pos, src.data(), src.size());
+ }
+
+ void StopSliceAtNull(Slice* slc) {
+ for (size_t i = 0; i < slc->size(); i++) {
+ if ((*slc)[i] == '\0') {
+ *slc = Slice(slc->data(), i);
+ break;
+ }
+ }
+ }
+
+ char buf_[10000];
+ RandomRWFile* file_;
+ std::string file_mirror_;
+};
+
+TEST_P(EnvPosixTestWithParam, PosixRandomRWFileRandomized) {
+ const std::string path = test::PerThreadDBPath(env_, "random_rw_file_rand");
+ env_->DeleteFile(path).PermitUncheckedError();
+
+ std::unique_ptr<RandomRWFile> file;
+
+#ifdef OS_LINUX
+ // Cannot open non-existing file.
+ ASSERT_NOK(env_->NewRandomRWFile(path, &file, EnvOptions()));
+#endif
+
+ // Create the file using WritableFile
+ {
+ std::unique_ptr<WritableFile> wf;
+ ASSERT_OK(env_->NewWritableFile(path, &wf, EnvOptions()));
+ }
+
+ ASSERT_OK(env_->NewRandomRWFile(path, &file, EnvOptions()));
+ RandomRWFileWithMirrorString file_with_mirror(file.get());
+
+ Random rnd(301);
+ std::string buf;
+ for (int i = 0; i < 10000; i++) {
+ // Genrate random data
+ buf = rnd.RandomString(10);
+
+ // Pick random offset for write
+ size_t write_off = rnd.Next() % 1000;
+ file_with_mirror.Write(write_off, buf);
+
+ // Pick random offset for read
+ size_t read_off = rnd.Next() % 1000;
+ size_t read_sz = rnd.Next() % 20;
+ file_with_mirror.Read(read_off, read_sz);
+
+ if (i % 500 == 0) {
+ // Reopen the file every 500 iters
+ ASSERT_OK(env_->NewRandomRWFile(path, &file, EnvOptions()));
+ file_with_mirror.SetFile(file.get());
+ }
+ }
+
+ // clean up
+ ASSERT_OK(env_->DeleteFile(path));
+}
+
+class TestEnv : public EnvWrapper {
+ public:
+ explicit TestEnv() : EnvWrapper(Env::Default()), close_count(0) {}
+ const char* Name() const override { return "TestEnv"; }
+ class TestLogger : public Logger {
+ public:
+ using Logger::Logv;
+ explicit TestLogger(TestEnv* env_ptr) : Logger() { env = env_ptr; }
+ ~TestLogger() override {
+ if (!closed_) {
+ Status s = CloseHelper();
+ s.PermitUncheckedError();
+ }
+ }
+ void Logv(const char* /*format*/, va_list /*ap*/) override {}
+
+ protected:
+ Status CloseImpl() override { return CloseHelper(); }
+
+ private:
+ Status CloseHelper() {
+ env->CloseCountInc();
+ return Status::OK();
+ }
+ TestEnv* env;
+ };
+
+ void CloseCountInc() { close_count++; }
+
+ int GetCloseCount() { return close_count; }
+
+ Status NewLogger(const std::string& /*fname*/,
+ std::shared_ptr<Logger>* result) override {
+ result->reset(new TestLogger(this));
+ return Status::OK();
+ }
+
+ private:
+ int close_count;
+};
+
+class EnvTest : public testing::Test {
+ public:
+ EnvTest() : test_directory_(test::PerThreadDBPath("env_test")) {}
+
+ protected:
+ const std::string test_directory_;
+};
+
+TEST_F(EnvTest, Close) {
+ TestEnv* env = new TestEnv();
+ std::shared_ptr<Logger> logger;
+ Status s;
+
+ s = env->NewLogger("", &logger);
+ ASSERT_OK(s);
+ ASSERT_OK(logger.get()->Close());
+ ASSERT_EQ(env->GetCloseCount(), 1);
+ // Call Close() again. CloseHelper() should not be called again
+ ASSERT_OK(logger.get()->Close());
+ ASSERT_EQ(env->GetCloseCount(), 1);
+ logger.reset();
+ ASSERT_EQ(env->GetCloseCount(), 1);
+
+ s = env->NewLogger("", &logger);
+ ASSERT_OK(s);
+ logger.reset();
+ ASSERT_EQ(env->GetCloseCount(), 2);
+
+ delete env;
+}
+
// Logger that overrides only the `Logv()` overload taking an InfoLogLevel;
// exercised by the LogvWithInfoLogLevel test below to confirm the base
// class routes all log macros through this overload.
class LogvWithInfoLogLevelLogger : public Logger {
 public:
  using Logger::Logv;
  void Logv(const InfoLogLevel /* log_level */, const char* /* format */,
            va_list /* ap */) override {}
};
+
+TEST_F(EnvTest, LogvWithInfoLogLevel) {
+ // Verifies the log functions work on a `Logger` that only overrides the
+ // `Logv()` overload including `InfoLogLevel`.
+ const std::string kSampleMessage("sample log message");
+ LogvWithInfoLogLevelLogger logger;
+ ROCKS_LOG_HEADER(&logger, "%s", kSampleMessage.c_str());
+ ROCKS_LOG_DEBUG(&logger, "%s", kSampleMessage.c_str());
+ ROCKS_LOG_INFO(&logger, "%s", kSampleMessage.c_str());
+ ROCKS_LOG_WARN(&logger, "%s", kSampleMessage.c_str());
+ ROCKS_LOG_ERROR(&logger, "%s", kSampleMessage.c_str());
+ ROCKS_LOG_FATAL(&logger, "%s", kSampleMessage.c_str());
+}
+
// Instantiate the posix Env test suite over Env::Default(), without and
// (in non-LITE builds) with direct I/O.
INSTANTIATE_TEST_CASE_P(DefaultEnvWithoutDirectIO, EnvPosixTestWithParam,
                        ::testing::Values(std::pair<Env*, bool>(Env::Default(),
                                                                false)));
#if !defined(ROCKSDB_LITE)
INSTANTIATE_TEST_CASE_P(DefaultEnvWithDirectIO, EnvPosixTestWithParam,
                        ::testing::Values(std::pair<Env*, bool>(Env::Default(),
                                                                true)));
#endif  // !defined(ROCKSDB_LITE)

#if !defined(ROCKSDB_LITE) && !defined(OS_WIN)
// Process-wide chroot Env rooted at the test tmp dir. A function-local
// static so it is constructed lazily on first use and shared by both
// instantiations below.
static Env* GetChrootEnv() {
  static std::unique_ptr<Env> chroot_env(
      NewChrootEnv(Env::Default(), test::TmpDir(Env::Default())));
  return chroot_env.get();
}
INSTANTIATE_TEST_CASE_P(ChrootEnvWithoutDirectIO, EnvPosixTestWithParam,
                        ::testing::Values(std::pair<Env*, bool>(GetChrootEnv(),
                                                                false)));
INSTANTIATE_TEST_CASE_P(ChrootEnvWithDirectIO, EnvPosixTestWithParam,
                        ::testing::Values(std::pair<Env*, bool>(GetChrootEnv(),
                                                                true)));
#endif  // !defined(ROCKSDB_LITE) && !defined(OS_WIN)
+
// Fixture parameterized over (env non-null, env is default, fs is default)
// that builds the corresponding Env/FileSystem combination for OptionsTest.
class EnvFSTestWithParam
    : public ::testing::Test,
      public ::testing::WithParamInterface<std::tuple<bool, bool, bool>> {
 public:
  EnvFSTestWithParam() {
    bool env_non_null = std::get<0>(GetParam());
    bool env_default = std::get<1>(GetParam());
    bool fs_default = std::get<2>(GetParam());

    env_ = env_non_null ? (env_default ? Env::Default() : nullptr) : nullptr;
    fs_ = fs_default
              ? FileSystem::Default()
              : std::make_shared<FaultInjectionTestFS>(FileSystem::Default());
    // Default env + custom fs: wrap the fs in a composite Env.
    if (env_non_null && env_default && !fs_default) {
      env_ptr_ = NewCompositeEnv(fs_);
    }
    // Custom env + default fs: use a fault-injection Env directly.
    if (env_non_null && !env_default && fs_default) {
      env_ptr_ =
          std::unique_ptr<Env>(new FaultInjectionTestEnv(Env::Default()));
      fs_.reset();
    }
    // Custom env + custom fs: layer a CompositeEnvWrapper over both.
    if (env_non_null && !env_default && !fs_default) {
      env_ptr_.reset(new FaultInjectionTestEnv(Env::Default()));
      composite_env_ptr_.reset(new CompositeEnvWrapper(env_ptr_.get(), fs_));
      env_ = composite_env_ptr_.get();
    } else {
      // NOTE(review): in the (non-null env, default env, default fs) case
      // this overwrites the Env::Default() assigned above with a null
      // env_ptr_, leaving env_ == nullptr. That appears intentional given
      // the `if (env_)` guard in OptionsTest, but confirm.
      env_ = env_ptr_.get();
    }

    dbname1_ = test::PerThreadDBPath("env_fs_test1");
    dbname2_ = test::PerThreadDBPath("env_fs_test2");
  }

  ~EnvFSTestWithParam() = default;

  Env* env_;                                // env under test (may be null)
  std::unique_ptr<Env> env_ptr_;            // owns a custom env, if any
  std::unique_ptr<Env> composite_env_ptr_;  // owns the CompositeEnvWrapper
  std::shared_ptr<FileSystem> fs_;          // fs under test (may be null)
  std::string dbname1_;
  std::string dbname2_;
};
+
+TEST_P(EnvFSTestWithParam, OptionsTest) {
+ Options opts;
+ opts.env = env_;
+ opts.create_if_missing = true;
+ std::string dbname = dbname1_;
+
+ if (env_) {
+ if (fs_) {
+ ASSERT_EQ(fs_.get(), env_->GetFileSystem().get());
+ } else {
+ ASSERT_NE(FileSystem::Default().get(), env_->GetFileSystem().get());
+ }
+ }
+ for (int i = 0; i < 2; ++i) {
+ DB* db;
+ Status s = DB::Open(opts, dbname, &db);
+ ASSERT_OK(s);
+
+ WriteOptions wo;
+ ASSERT_OK(db->Put(wo, "a", "a"));
+ ASSERT_OK(db->Flush(FlushOptions()));
+ ASSERT_OK(db->Put(wo, "b", "b"));
+ ASSERT_OK(db->Flush(FlushOptions()));
+ ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ std::string val;
+ ASSERT_OK(db->Get(ReadOptions(), "a", &val));
+ ASSERT_EQ("a", val);
+ ASSERT_OK(db->Get(ReadOptions(), "b", &val));
+ ASSERT_EQ("b", val);
+
+ ASSERT_OK(db->Close());
+ delete db;
+ ASSERT_OK(DestroyDB(dbname, opts));
+
+ dbname = dbname2_;
+ }
+}
+
// The parameters are as follows -
// 1. True means Options::env is non-null, false means null
// 2. True means use Env::Default, false means a custom (fault-injecting) Env
// 3. True means use FileSystem::Default, false means a custom
//    (fault-injecting) FileSystem
INSTANTIATE_TEST_CASE_P(EnvFSTest, EnvFSTestWithParam,
                        ::testing::Combine(::testing::Bool(), ::testing::Bool(),
                                           ::testing::Bool()));
+// This test ensures that default Env and those allocated by
+// NewCompositeEnv() all share the same threadpool
+TEST_F(EnvTest, MultipleCompositeEnv) {
+ std::shared_ptr<FaultInjectionTestFS> fs1 =
+ std::make_shared<FaultInjectionTestFS>(FileSystem::Default());
+ std::shared_ptr<FaultInjectionTestFS> fs2 =
+ std::make_shared<FaultInjectionTestFS>(FileSystem::Default());
+ std::unique_ptr<Env> env1 = NewCompositeEnv(fs1);
+ std::unique_ptr<Env> env2 = NewCompositeEnv(fs2);
+ Env::Default()->SetBackgroundThreads(8, Env::HIGH);
+ Env::Default()->SetBackgroundThreads(16, Env::LOW);
+ ASSERT_EQ(env1->GetBackgroundThreads(Env::LOW), 16);
+ ASSERT_EQ(env1->GetBackgroundThreads(Env::HIGH), 8);
+ ASSERT_EQ(env2->GetBackgroundThreads(Env::LOW), 16);
+ ASSERT_EQ(env2->GetBackgroundThreads(Env::HIGH), 8);
+}
+
+TEST_F(EnvTest, IsDirectory) {
+ Status s = Env::Default()->CreateDirIfMissing(test_directory_);
+ ASSERT_OK(s);
+ const std::string test_sub_dir = test_directory_ + "sub1";
+ const std::string test_file_path = test_directory_ + "file1";
+ ASSERT_OK(Env::Default()->CreateDirIfMissing(test_sub_dir));
+ bool is_dir = false;
+ ASSERT_OK(Env::Default()->IsDirectory(test_sub_dir, &is_dir));
+ ASSERT_TRUE(is_dir);
+ {
+ std::unique_ptr<FSWritableFile> wfile;
+ s = Env::Default()->GetFileSystem()->NewWritableFile(
+ test_file_path, FileOptions(), &wfile, /*dbg=*/nullptr);
+ ASSERT_OK(s);
+ std::unique_ptr<WritableFileWriter> fwriter;
+ fwriter.reset(new WritableFileWriter(std::move(wfile), test_file_path,
+ FileOptions(),
+ SystemClock::Default().get()));
+ constexpr char buf[] = "test";
+ s = fwriter->Append(buf);
+ ASSERT_OK(s);
+ }
+ ASSERT_OK(Env::Default()->IsDirectory(test_file_path, &is_dir));
+ ASSERT_FALSE(is_dir);
+}
+
+TEST_F(EnvTest, EnvWriteVerificationTest) {
+ Status s = Env::Default()->CreateDirIfMissing(test_directory_);
+ const std::string test_file_path = test_directory_ + "file1";
+ ASSERT_OK(s);
+ std::shared_ptr<FaultInjectionTestFS> fault_fs(
+ new FaultInjectionTestFS(FileSystem::Default()));
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+ std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
+ std::unique_ptr<WritableFile> file;
+ s = fault_fs_env->NewWritableFile(test_file_path, &file, EnvOptions());
+ ASSERT_OK(s);
+
+ DataVerificationInfo v_info;
+ std::string test_data = "test";
+ std::string checksum;
+ uint32_t v_crc32c = crc32c::Extend(0, test_data.c_str(), test_data.size());
+ PutFixed32(&checksum, v_crc32c);
+ v_info.checksum = Slice(checksum);
+ s = file->Append(Slice(test_data), v_info);
+ ASSERT_OK(s);
+}
+
// Fixture for object-registry creation tests; configures strict option
// parsing so unknown or unsupported options are reported as errors.
class CreateEnvTest : public testing::Test {
 public:
  CreateEnvTest() {
    config_options_.ignore_unknown_options = false;
    config_options_.ignore_unsupported_options = false;
  }
  ConfigOptions config_options_;  // shared parsing configuration for the tests
};
+
+#ifndef ROCKSDB_LITE
+TEST_F(CreateEnvTest, LoadCTRProvider) {
+ config_options_.invoke_prepare_options = false;
+ std::string CTR = CTREncryptionProvider::kClassName();
+ std::shared_ptr<EncryptionProvider> provider;
+ // Test a provider with no cipher
+ ASSERT_OK(
+ EncryptionProvider::CreateFromString(config_options_, CTR, &provider));
+ ASSERT_NE(provider, nullptr);
+ ASSERT_EQ(provider->Name(), CTR);
+ ASSERT_NOK(provider->PrepareOptions(config_options_));
+ ASSERT_NOK(provider->ValidateOptions(DBOptions(), ColumnFamilyOptions()));
+ auto cipher = provider->GetOptions<std::shared_ptr<BlockCipher>>("Cipher");
+ ASSERT_NE(cipher, nullptr);
+ ASSERT_EQ(cipher->get(), nullptr);
+ provider.reset();
+
+ ASSERT_OK(EncryptionProvider::CreateFromString(config_options_,
+ CTR + "://test", &provider));
+ ASSERT_NE(provider, nullptr);
+ ASSERT_EQ(provider->Name(), CTR);
+ ASSERT_OK(provider->PrepareOptions(config_options_));
+ ASSERT_OK(provider->ValidateOptions(DBOptions(), ColumnFamilyOptions()));
+ cipher = provider->GetOptions<std::shared_ptr<BlockCipher>>("Cipher");
+ ASSERT_NE(cipher, nullptr);
+ ASSERT_NE(cipher->get(), nullptr);
+ ASSERT_STREQ(cipher->get()->Name(), "ROT13");
+ provider.reset();
+
+ ASSERT_OK(EncryptionProvider::CreateFromString(config_options_, "1://test",
+ &provider));
+ ASSERT_NE(provider, nullptr);
+ ASSERT_EQ(provider->Name(), CTR);
+ ASSERT_OK(provider->PrepareOptions(config_options_));
+ ASSERT_OK(provider->ValidateOptions(DBOptions(), ColumnFamilyOptions()));
+ cipher = provider->GetOptions<std::shared_ptr<BlockCipher>>("Cipher");
+ ASSERT_NE(cipher, nullptr);
+ ASSERT_NE(cipher->get(), nullptr);
+ ASSERT_STREQ(cipher->get()->Name(), "ROT13");
+ provider.reset();
+
+ ASSERT_OK(EncryptionProvider::CreateFromString(
+ config_options_, "id=" + CTR + "; cipher=ROT13", &provider));
+ ASSERT_NE(provider, nullptr);
+ ASSERT_EQ(provider->Name(), CTR);
+ cipher = provider->GetOptions<std::shared_ptr<BlockCipher>>("Cipher");
+ ASSERT_NE(cipher, nullptr);
+ ASSERT_NE(cipher->get(), nullptr);
+ ASSERT_STREQ(cipher->get()->Name(), "ROT13");
+ provider.reset();
+}
+
+TEST_F(CreateEnvTest, LoadROT13Cipher) {
+ std::shared_ptr<BlockCipher> cipher;
+ // Test a provider with no cipher
+ ASSERT_OK(BlockCipher::CreateFromString(config_options_, "ROT13", &cipher));
+ ASSERT_NE(cipher, nullptr);
+ ASSERT_STREQ(cipher->Name(), "ROT13");
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(CreateEnvTest, CreateDefaultSystemClock) {
+ std::shared_ptr<SystemClock> clock, copy;
+ ASSERT_OK(SystemClock::CreateFromString(config_options_,
+ SystemClock::kDefaultName(), &clock));
+ ASSERT_NE(clock, nullptr);
+ ASSERT_EQ(clock, SystemClock::Default());
+#ifndef ROCKSDB_LITE
+ std::string opts_str = clock->ToString(config_options_);
+ std::string mismatch;
+ ASSERT_OK(SystemClock::CreateFromString(config_options_, opts_str, &copy));
+ ASSERT_TRUE(clock->AreEquivalent(config_options_, copy.get(), &mismatch));
+#endif // ROCKSDB_LITE
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(CreateEnvTest, CreateMockSystemClock) {
+ std::shared_ptr<SystemClock> mock, copy;
+
+ config_options_.registry->AddLibrary("test")->AddFactory<SystemClock>(
+ MockSystemClock::kClassName(),
+ [](const std::string& /*uri*/, std::unique_ptr<SystemClock>* guard,
+ std::string* /* errmsg */) {
+ guard->reset(new MockSystemClock(nullptr));
+ return guard->get();
+ });
+ ASSERT_OK(SystemClock::CreateFromString(
+ config_options_, EmulatedSystemClock::kClassName(), &mock));
+ ASSERT_NE(mock, nullptr);
+ ASSERT_STREQ(mock->Name(), EmulatedSystemClock::kClassName());
+ ASSERT_EQ(mock->Inner(), SystemClock::Default().get());
+ std::string opts_str = mock->ToString(config_options_);
+ std::string mismatch;
+ ASSERT_OK(SystemClock::CreateFromString(config_options_, opts_str, &copy));
+ ASSERT_TRUE(mock->AreEquivalent(config_options_, copy.get(), &mismatch));
+
+ std::string id = std::string("id=") + EmulatedSystemClock::kClassName() +
+ ";target=" + MockSystemClock::kClassName();
+
+ ASSERT_OK(SystemClock::CreateFromString(config_options_, id, &mock));
+ ASSERT_NE(mock, nullptr);
+ ASSERT_STREQ(mock->Name(), EmulatedSystemClock::kClassName());
+ ASSERT_NE(mock->Inner(), nullptr);
+ ASSERT_STREQ(mock->Inner()->Name(), MockSystemClock::kClassName());
+ ASSERT_EQ(mock->Inner()->Inner(), SystemClock::Default().get());
+ opts_str = mock->ToString(config_options_);
+ ASSERT_OK(SystemClock::CreateFromString(config_options_, opts_str, &copy));
+ ASSERT_TRUE(mock->AreEquivalent(config_options_, copy.get(), &mismatch));
+ ASSERT_OK(SystemClock::CreateFromString(
+ config_options_, EmulatedSystemClock::kClassName(), &mock));
+}
+
// Verifies that a ReadOnlyFileSystem can be created by registered name or by
// an options string, that it wraps FileSystem::Default() when no target is
// given, and that its serialized options round-trip to an equivalent FS.
TEST_F(CreateEnvTest, CreateReadOnlyFileSystem) {
  std::shared_ptr<FileSystem> fs, copy;

  ASSERT_OK(FileSystem::CreateFromString(
      config_options_, ReadOnlyFileSystem::kClassName(), &fs));
  ASSERT_NE(fs, nullptr);
  ASSERT_STREQ(fs->Name(), ReadOnlyFileSystem::kClassName());
  // With no explicit target, the wrapper falls back to the default FS.
  ASSERT_EQ(fs->Inner(), FileSystem::Default().get());

  std::string opts_str = fs->ToString(config_options_);
  std::string mismatch;

  // Round-trip: the serialized form must reconstruct an equivalent FS.
  ASSERT_OK(FileSystem::CreateFromString(config_options_, opts_str, &copy));
  ASSERT_TRUE(fs->AreEquivalent(config_options_, copy.get(), &mismatch));

  // Nest a TimedFileSystem inside the read-only wrapper via "target=".
  ASSERT_OK(FileSystem::CreateFromString(
      config_options_,
      std::string("id=") + ReadOnlyFileSystem::kClassName() +
          "; target=" + TimedFileSystem::kClassName(),
      &fs));
  ASSERT_NE(fs, nullptr);
  opts_str = fs->ToString(config_options_);
  ASSERT_STREQ(fs->Name(), ReadOnlyFileSystem::kClassName());
  ASSERT_NE(fs->Inner(), nullptr);
  ASSERT_STREQ(fs->Inner()->Name(), TimedFileSystem::kClassName());
  // The innermost target still defaults to FileSystem::Default().
  ASSERT_EQ(fs->Inner()->Inner(), FileSystem::Default().get());
  ASSERT_OK(FileSystem::CreateFromString(config_options_, opts_str, &copy));
  ASSERT_TRUE(fs->AreEquivalent(config_options_, copy.get(), &mismatch));
}
+
// Same shape as CreateReadOnlyFileSystem, but for TimedFileSystem: create by
// name, verify the default inner FS, round-trip serialization, then nest a
// ReadOnlyFileSystem inside it via the "target=" option.
TEST_F(CreateEnvTest, CreateTimedFileSystem) {
  std::shared_ptr<FileSystem> fs, copy;

  ASSERT_OK(FileSystem::CreateFromString(config_options_,
                                         TimedFileSystem::kClassName(), &fs));
  ASSERT_NE(fs, nullptr);
  ASSERT_STREQ(fs->Name(), TimedFileSystem::kClassName());
  // With no explicit target, the wrapper falls back to the default FS.
  ASSERT_EQ(fs->Inner(), FileSystem::Default().get());

  std::string opts_str = fs->ToString(config_options_);
  std::string mismatch;

  // Round-trip: the serialized form must reconstruct an equivalent FS.
  ASSERT_OK(FileSystem::CreateFromString(config_options_, opts_str, &copy));
  ASSERT_TRUE(fs->AreEquivalent(config_options_, copy.get(), &mismatch));

  ASSERT_OK(FileSystem::CreateFromString(
      config_options_,
      std::string("id=") + TimedFileSystem::kClassName() +
          "; target=" + ReadOnlyFileSystem::kClassName(),
      &fs));
  ASSERT_NE(fs, nullptr);
  opts_str = fs->ToString(config_options_);
  ASSERT_STREQ(fs->Name(), TimedFileSystem::kClassName());
  ASSERT_NE(fs->Inner(), nullptr);
  ASSERT_STREQ(fs->Inner()->Name(), ReadOnlyFileSystem::kClassName());
  ASSERT_EQ(fs->Inner()->Inner(), FileSystem::Default().get());
  ASSERT_OK(FileSystem::CreateFromString(config_options_, opts_str, &copy));
  ASSERT_TRUE(fs->AreEquivalent(config_options_, copy.get(), &mismatch));
}
+
// Same shape as the preceding tests, but for CountedFileSystem: create by
// name, verify the default inner FS, round-trip serialization, then nest a
// ReadOnlyFileSystem inside it via the "target=" option.
TEST_F(CreateEnvTest, CreateCountedFileSystem) {
  std::shared_ptr<FileSystem> fs, copy;

  ASSERT_OK(FileSystem::CreateFromString(config_options_,
                                         CountedFileSystem::kClassName(), &fs));
  ASSERT_NE(fs, nullptr);
  ASSERT_STREQ(fs->Name(), CountedFileSystem::kClassName());
  // With no explicit target, the wrapper falls back to the default FS.
  ASSERT_EQ(fs->Inner(), FileSystem::Default().get());

  std::string opts_str = fs->ToString(config_options_);
  std::string mismatch;

  // Round-trip: the serialized form must reconstruct an equivalent FS.
  ASSERT_OK(FileSystem::CreateFromString(config_options_, opts_str, &copy));
  ASSERT_TRUE(fs->AreEquivalent(config_options_, copy.get(), &mismatch));

  ASSERT_OK(FileSystem::CreateFromString(
      config_options_,
      std::string("id=") + CountedFileSystem::kClassName() +
          "; target=" + ReadOnlyFileSystem::kClassName(),
      &fs));
  ASSERT_NE(fs, nullptr);
  opts_str = fs->ToString(config_options_);
  ASSERT_STREQ(fs->Name(), CountedFileSystem::kClassName());
  ASSERT_NE(fs->Inner(), nullptr);
  ASSERT_STREQ(fs->Inner()->Name(), ReadOnlyFileSystem::kClassName());
  ASSERT_EQ(fs->Inner()->Inner(), FileSystem::Default().get());
  ASSERT_OK(FileSystem::CreateFromString(config_options_, opts_str, &copy));
  ASSERT_TRUE(fs->AreEquivalent(config_options_, copy.get(), &mismatch));
}
+
+#ifndef OS_WIN
// Exercises ChrootFileSystem creation: the mandatory "chroot_dir" option,
// rejection of a nonexistent directory, round-trip serialization, and
// nesting both as the outer wrapper and as the inner target of a
// TimedFileSystem. Not built on Windows (no chroot there).
TEST_F(CreateEnvTest, CreateChrootFileSystem) {
  std::shared_ptr<FileSystem> fs, copy;
  auto tmp_dir = test::TmpDir(Env::Default());
  // The Chroot FileSystem has a required "chroot_dir" option.
  ASSERT_NOK(FileSystem::CreateFromString(config_options_,
                                          ChrootFileSystem::kClassName(), &fs));

  // ChrootFileSystem fails with an invalid directory
  ASSERT_NOK(FileSystem::CreateFromString(
      config_options_,
      std::string("chroot_dir=/No/Such/Directory; id=") +
          ChrootFileSystem::kClassName(),
      &fs));
  std::string chroot_opts = std::string("chroot_dir=") + tmp_dir +
                            std::string("; id=") +
                            ChrootFileSystem::kClassName();

  // Create a valid ChrootFileSystem with an inner Default
  ASSERT_OK(FileSystem::CreateFromString(config_options_, chroot_opts, &fs));
  ASSERT_NE(fs, nullptr);
  ASSERT_STREQ(fs->Name(), ChrootFileSystem::kClassName());
  ASSERT_EQ(fs->Inner(), FileSystem::Default().get());
  std::string opts_str = fs->ToString(config_options_);
  std::string mismatch;
  // Round-trip: the serialized form must reconstruct an equivalent FS.
  ASSERT_OK(FileSystem::CreateFromString(config_options_, opts_str, &copy));
  ASSERT_TRUE(fs->AreEquivalent(config_options_, copy.get(), &mismatch));

  // Create a valid ChrootFileSystem with an inner TimedFileSystem
  ASSERT_OK(FileSystem::CreateFromString(
      config_options_,
      chroot_opts + "; target=" + TimedFileSystem::kClassName(), &fs));
  ASSERT_NE(fs, nullptr);
  ASSERT_STREQ(fs->Name(), ChrootFileSystem::kClassName());
  ASSERT_NE(fs->Inner(), nullptr);
  ASSERT_STREQ(fs->Inner()->Name(), TimedFileSystem::kClassName());
  ASSERT_EQ(fs->Inner()->Inner(), FileSystem::Default().get());
  opts_str = fs->ToString(config_options_);
  ASSERT_OK(FileSystem::CreateFromString(config_options_, opts_str, &copy));
  ASSERT_TRUE(fs->AreEquivalent(config_options_, copy.get(), &mismatch));

  // Create a TimedFileSystem with an inner ChrootFileSystem
  ASSERT_OK(FileSystem::CreateFromString(
      config_options_,
      "target={" + chroot_opts + "}; id=" + TimedFileSystem::kClassName(),
      &fs));
  ASSERT_NE(fs, nullptr);
  ASSERT_STREQ(fs->Name(), TimedFileSystem::kClassName());
  ASSERT_NE(fs->Inner(), nullptr);
  ASSERT_STREQ(fs->Inner()->Name(), ChrootFileSystem::kClassName());
  ASSERT_EQ(fs->Inner()->Inner(), FileSystem::Default().get());
  opts_str = fs->ToString(config_options_);
  ASSERT_OK(FileSystem::CreateFromString(config_options_, opts_str, &copy));
  ASSERT_TRUE(fs->AreEquivalent(config_options_, copy.get(), &mismatch));
}
+#endif // OS_WIN
+
// Exercises EncryptedFileSystem creation: the mandatory "provider" option,
// round-trip serialization, and nesting a TimedFileSystem as the target.
TEST_F(CreateEnvTest, CreateEncryptedFileSystem) {
  std::shared_ptr<FileSystem> fs, copy;

  // "1://test" selects the builtin CTR provider with a test key.
  std::string base_opts =
      std::string("provider=1://test; id=") + EncryptedFileSystem::kClassName();
  // The EncryptedFileSystem requires a "provider" option.
  ASSERT_NOK(FileSystem::CreateFromString(
      config_options_, EncryptedFileSystem::kClassName(), &fs));

  ASSERT_OK(FileSystem::CreateFromString(config_options_, base_opts, &fs));

  ASSERT_NE(fs, nullptr);
  ASSERT_STREQ(fs->Name(), EncryptedFileSystem::kClassName());
  // With no explicit target, the wrapper falls back to the default FS.
  ASSERT_EQ(fs->Inner(), FileSystem::Default().get());
  std::string opts_str = fs->ToString(config_options_);
  std::string mismatch;
  // Round-trip: the serialized form must reconstruct an equivalent FS.
  ASSERT_OK(FileSystem::CreateFromString(config_options_, opts_str, &copy));
  ASSERT_TRUE(fs->AreEquivalent(config_options_, copy.get(), &mismatch));
  ASSERT_OK(FileSystem::CreateFromString(
      config_options_, base_opts + "; target=" + TimedFileSystem::kClassName(),
      &fs));
  ASSERT_NE(fs, nullptr);
  ASSERT_STREQ(fs->Name(), EncryptedFileSystem::kClassName());
  ASSERT_NE(fs->Inner(), nullptr);
  ASSERT_STREQ(fs->Inner()->Name(), TimedFileSystem::kClassName());
  ASSERT_EQ(fs->Inner()->Inner(), FileSystem::Default().get());
  opts_str = fs->ToString(config_options_);
  ASSERT_OK(FileSystem::CreateFromString(config_options_, opts_str, &copy));
  ASSERT_TRUE(fs->AreEquivalent(config_options_, copy.get(), &mismatch));
}
+
+#endif // ROCKSDB_LITE
+
+namespace {
+
// Thread count and ids-per-thread for NoDuplicateMiniStressTest below:
// 8 * 1000 = 8000 ids generated (and required unique) per test run.
constexpr size_t kThreads = 8;
constexpr size_t kIdsPerThread = 1000;
+
+// This is a mini-stress test to check for duplicates in functions like
+// GenerateUniqueId()
// This is a mini-stress test to check for duplicates in functions like
// GenerateUniqueId()
//
// Subclasses implement Generate() to produce one id; Run() generates ids
// from kThreads concurrent threads and asserts that all of them are unique.
template <typename IdType, class Hash = std::hash<IdType>>
struct NoDuplicateMiniStressTest {
  // All ids from all threads, collated under `mutex` after generation.
  std::unordered_set<IdType, Hash> ids;
  std::mutex mutex;
  Env* env;

  NoDuplicateMiniStressTest() { env = Env::Default(); }

  virtual ~NoDuplicateMiniStressTest() {}

  // Spawn the generator threads, wait for them, and verify uniqueness.
  void Run() {
    std::array<std::thread, kThreads> threads;
    for (size_t i = 0; i < kThreads; ++i) {
      threads[i] = std::thread([&]() { ThreadFn(); });
    }
    for (auto& thread : threads) {
      thread.join();
    }
    // All must be unique: any duplicate would shrink the set below the
    // total number of generated ids.
    ASSERT_EQ(ids.size(), kThreads * kIdsPerThread);
  }

  void ThreadFn() {
    std::array<IdType, kIdsPerThread> my_ids;
    // Generate in parallel threads as fast as possible (no locking here,
    // to maximize contention inside the generator under test)
    for (size_t i = 0; i < kIdsPerThread; ++i) {
      my_ids[i] = Generate();
    }
    // Now collate
    std::lock_guard<std::mutex> lock(mutex);
    for (auto& id : my_ids) {
      ids.insert(id);
    }
  }

  // Produce a single id; must be safe to call concurrently.
  virtual IdType Generate() = 0;
};
+
// Verify that every id in `uuids` is shaped like an RFC-4122 UUID string:
// 36 characters with dashes at positions 8/13/18/23 and hex digits
// everywhere else. The version/variant nibbles are deliberately not pinned
// because port::GenerateRfcUuid may yield different UUID versions per
// platform (TODO confirm: could tighten to version 4 where guaranteed).
// The previous implementation only returned early on empty input and
// verified nothing.
void VerifyRfcUuids(const std::unordered_set<std::string>& uuids) {
  for (const auto& uuid : uuids) {
    assert(uuid.size() == 36U);
    for (size_t i = 0; i < uuid.size(); ++i) {
      const unsigned char c = static_cast<unsigned char>(uuid[i]);
      if (i == 8 || i == 13 || i == 18 || i == 23) {
        assert(c == '-');
      } else {
        assert(std::isxdigit(c) != 0);
      }
    }
  }
}
+
// A pair of 64-bit values, used as a 128-bit id by the stress tests below.
using uint64_pair_t = std::pair<uint64_t, uint64_t>;

// Hash functor for uint64_pair_t. A plain XOR of the halves suffices here
// because the generators under test already produce well-distributed bits.
struct HashUint64Pair {
  std::size_t operator()(const uint64_pair_t& u) const noexcept {
    // Assume suitable distribution already
    return static_cast<std::size_t>(u.first ^ u.second);
  }
};
+
+} // namespace
+
// Stress Env::GenerateUniqueId() for duplicates across threads and check
// that each id looks like an RFC-4122 UUID string.
TEST_F(EnvTest, GenerateUniqueId) {
  struct MyStressTest : public NoDuplicateMiniStressTest<std::string> {
    std::string Generate() override { return env->GenerateUniqueId(); }
  };

  MyStressTest t;
  t.Run();

  // Basically verify RFC-4122 format: 36 chars with dashes at the four
  // standard positions.
  for (auto& uuid : t.ids) {
    ASSERT_EQ(36U, uuid.size());
    ASSERT_EQ('-', uuid[8]);
    ASSERT_EQ('-', uuid[13]);
    ASSERT_EQ('-', uuid[18]);
    ASSERT_EQ('-', uuid[23]);
  }
}
+
// Stress DBImpl::GenerateDbSessionId() for duplicates across threads and
// check the fixed 20-character session id length.
TEST_F(EnvTest, GenerateDbSessionId) {
  struct MyStressTest : public NoDuplicateMiniStressTest<std::string> {
    std::string Generate() override { return DBImpl::GenerateDbSessionId(env); }
  };

  MyStressTest t;
  t.Run();

  // Basically verify session ID
  for (auto& id : t.ids) {
    ASSERT_EQ(20U, id.size());
  }
}
+
// Platforms on which port::GenerateRfcUuid() is expected to succeed; on
// other platforms the dependent tests below are skipped.
constexpr bool kRequirePortGenerateRfcUuid =
#if defined(OS_LINUX) || defined(OS_ANDROID) || defined(OS_WIN)
    true;
#else
    false;
#endif
+
+TEST_F(EnvTest, PortGenerateRfcUuid) {
+ if (!kRequirePortGenerateRfcUuid) {
+ ROCKSDB_GTEST_SKIP("Not supported/expected on this platform");
+ return;
+ }
+ struct MyStressTest : public NoDuplicateMiniStressTest<std::string> {
+ std::string Generate() override {
+ std::string u;
+ assert(port::GenerateRfcUuid(&u));
+ return u;
+ }
+ };
+
+ MyStressTest t;
+ t.Run();
+
+ // Extra verification on versions and variants
+ VerifyRfcUuids(t.ids);
+}
+
+// Test the atomic, linear generation of GenerateRawUuid
+TEST_F(EnvTest, GenerateRawUniqueId) {
+ struct MyStressTest
+ : public NoDuplicateMiniStressTest<uint64_pair_t, HashUint64Pair> {
+ uint64_pair_t Generate() override {
+ uint64_pair_t p;
+ GenerateRawUniqueId(&p.first, &p.second);
+ return p;
+ }
+ };
+
+ MyStressTest t;
+ t.Run();
+}
+
+// Test that each entropy source ("track") is at least adequate
// Test that each entropy source ("track") is at least adequate on its own:
// here, only the port-level UUID track is enabled (env details and
// std::random_device are excluded).
TEST_F(EnvTest, GenerateRawUniqueIdTrackPortUuidOnly) {
  if (!kRequirePortGenerateRfcUuid) {
    ROCKSDB_GTEST_SKIP("Not supported/expected on this platform");
    return;
  }

  struct MyStressTest
      : public NoDuplicateMiniStressTest<uint64_pair_t, HashUint64Pair> {
    uint64_pair_t Generate() override {
      uint64_pair_t p;
      // exclude_port_uuid=false, exclude_env_details=true,
      // exclude_random_device=true -- TODO confirm parameter order against
      // the TEST_GenerateRawUniqueId declaration.
      TEST_GenerateRawUniqueId(&p.first, &p.second, false, true, true);
      return p;
    }
  };

  MyStressTest t;
  t.Run();
}
+
// Entropy-track isolation: only the env-details track is enabled; the
// generated ids must still be unique across threads.
TEST_F(EnvTest, GenerateRawUniqueIdTrackEnvDetailsOnly) {
  struct MyStressTest
      : public NoDuplicateMiniStressTest<uint64_pair_t, HashUint64Pair> {
    uint64_pair_t Generate() override {
      uint64_pair_t p;
      TEST_GenerateRawUniqueId(&p.first, &p.second, true, false, true);
      return p;
    }
  };

  MyStressTest t;
  t.Run();
}
+
// Entropy-track isolation: only the std::random_device track is enabled;
// the generated ids must still be unique across threads.
TEST_F(EnvTest, GenerateRawUniqueIdTrackRandomDeviceOnly) {
  struct MyStressTest
      : public NoDuplicateMiniStressTest<uint64_pair_t, HashUint64Pair> {
    uint64_pair_t Generate() override {
      uint64_pair_t p;
      TEST_GenerateRawUniqueId(&p.first, &p.second, true, true, false);
      return p;
    }
  };

  MyStressTest t;
  t.Run();
}
+
// Stress SemiStructuredUniqueIdGen for duplicates when shared by many
// threads through a single static instance.
TEST_F(EnvTest, SemiStructuredUniqueIdGenTest) {
  // Must be thread safe and usable as a static
  static SemiStructuredUniqueIdGen gen;

  struct MyStressTest
      : public NoDuplicateMiniStressTest<uint64_pair_t, HashUint64Pair> {
    uint64_pair_t Generate() override {
      uint64_pair_t p;
      // All threads share the single static generator above.
      gen.GenerateNext(&p.first, &p.second);
      return p;
    }
  };

  MyStressTest t;
  t.Run();
}
+
// Locking a file in a nonexistent directory must fail cleanly (no lock
// handed out), and a later attempt must succeed once the directory exists.
TEST_F(EnvTest, FailureToCreateLockFile) {
  auto env = Env::Default();
  auto fs = env->GetFileSystem();
  std::string dir = test::PerThreadDBPath(env, "lockdir");
  std::string file = dir + "/lockfile";

  // Ensure directory doesn't exist
  ASSERT_OK(DestroyDir(env, dir));

  // Make sure that we can acquire a file lock after the first attempt fails
  FileLock* lock = nullptr;
  ASSERT_NOK(fs->LockFile(file, IOOptions(), &lock, /*dbg*/ nullptr));
  // A failed LockFile must not leave a dangling lock object behind.
  ASSERT_FALSE(lock);

  ASSERT_OK(fs->CreateDir(dir, IOOptions(), /*dbg*/ nullptr));
  ASSERT_OK(fs->LockFile(file, IOOptions(), &lock, /*dbg*/ nullptr));
  ASSERT_OK(fs->UnlockFile(lock, IOOptions(), /*dbg*/ nullptr));

  // Clean up
  ASSERT_OK(DestroyDir(env, dir));
}
+
// Creating an Env from an empty string or the default name must return the
// singleton Env::Default() and must NOT populate the ownership guard (the
// default Env is never owned by the caller).
TEST_F(CreateEnvTest, CreateDefaultEnv) {
  ConfigOptions options;
  options.ignore_unsupported_options = false;

  std::shared_ptr<Env> guard;
  Env* env = nullptr;
  ASSERT_OK(Env::CreateFromString(options, "", &env));
  ASSERT_EQ(env, Env::Default());

  env = nullptr;
  ASSERT_OK(Env::CreateFromString(options, Env::kDefaultName(), &env));
  ASSERT_EQ(env, Env::Default());

  env = nullptr;
  ASSERT_OK(Env::CreateFromString(options, "", &env, &guard));
  ASSERT_EQ(env, Env::Default());
  // The static default Env is not transferred to the guard.
  ASSERT_EQ(guard, nullptr);

  env = nullptr;
  ASSERT_OK(Env::CreateFromString(options, Env::kDefaultName(), &env, &guard));
  ASSERT_EQ(env, Env::Default());
  ASSERT_EQ(guard, nullptr);

#ifndef ROCKSDB_LITE
  // The serialized form of the default Env must also resolve back to it.
  std::string opt_str = env->ToString(options);
  ASSERT_OK(Env::CreateFromString(options, opt_str, &env));
  ASSERT_EQ(env, Env::Default());
  ASSERT_OK(Env::CreateFromString(options, opt_str, &env, &guard));
  ASSERT_EQ(env, Env::Default());
  ASSERT_EQ(guard, nullptr);
#endif  // ROCKSDB_LITE
}
+
+#ifndef ROCKSDB_LITE
namespace {
// Minimal EnvWrapper used to exercise registration, serialization, and
// nesting of wrapped environments in the tests below.
class WrappedEnv : public EnvWrapper {
 public:
  explicit WrappedEnv(Env* t) : EnvWrapper(t) {}
  explicit WrappedEnv(const std::shared_ptr<Env>& t) : EnvWrapper(t) {}
  static const char* kClassName() { return "WrappedEnv"; }
  const char* Name() const override { return kClassName(); }
  // Registers a factory creating a WrappedEnv with a null target; the
  // target is expected to be filled in later via configuration.
  static void Register(ObjectLibrary& lib, const std::string& /*arg*/) {
    lib.AddFactory<Env>(
        WrappedEnv::kClassName(),
        [](const std::string& /*uri*/, std::unique_ptr<Env>* guard,
           std::string* /* errmsg */) {
          guard->reset(new WrappedEnv(nullptr));
          return guard->get();
        });
  }
};
}  // namespace
// MockEnv must be creatable only with an ownership guard (it is never a
// static singleton), must round-trip through its serialized options, and
// must serialize correctly when built over a wrapped Env.
TEST_F(CreateEnvTest, CreateMockEnv) {
  ConfigOptions options;
  options.ignore_unsupported_options = false;
  WrappedEnv::Register(*(options.registry->AddLibrary("test")), "");
  std::shared_ptr<Env> guard, copy;
  std::string opt_str;

  Env* env = nullptr;
  // Without a guard there is nothing to own the new MockEnv -> failure.
  ASSERT_NOK(Env::CreateFromString(options, MockEnv::kClassName(), &env));
  ASSERT_OK(
      Env::CreateFromString(options, MockEnv::kClassName(), &env, &guard));
  ASSERT_NE(env, nullptr);
  ASSERT_NE(env, Env::Default());
  opt_str = env->ToString(options);
  ASSERT_OK(Env::CreateFromString(options, opt_str, &env, &copy));
  // A distinct instance is created, but it must be equivalent.
  ASSERT_NE(copy, guard);
  std::string mismatch;
  ASSERT_TRUE(guard->AreEquivalent(options, copy.get(), &mismatch));
  guard.reset(MockEnv::Create(Env::Default(), SystemClock::Default()));
  opt_str = guard->ToString(options);
  ASSERT_OK(Env::CreateFromString(options, opt_str, &env, &copy));
  // MockEnv over a registered WrappedEnv must also serialize/deserialize.
  std::unique_ptr<Env> wrapped_env(new WrappedEnv(Env::Default()));
  guard.reset(MockEnv::Create(wrapped_env.get(), SystemClock::Default()));
  opt_str = guard->ToString(options);
  ASSERT_OK(Env::CreateFromString(options, opt_str, &env, &copy));
  opt_str = copy->ToString(options);
}
+
// WrappedEnv must require an ownership guard, must not compare equivalent
// to the default Env, and must round-trip through serialization even when
// nested two and three levels deep.
TEST_F(CreateEnvTest, CreateWrappedEnv) {
  ConfigOptions options;
  options.ignore_unsupported_options = false;
  WrappedEnv::Register(*(options.registry->AddLibrary("test")), "");
  Env* env = nullptr;
  std::shared_ptr<Env> guard, copy;
  std::string opt_str;
  std::string mismatch;

  // Without a guard there is nothing to own the new WrappedEnv -> failure.
  ASSERT_NOK(Env::CreateFromString(options, WrappedEnv::kClassName(), &env));
  ASSERT_OK(
      Env::CreateFromString(options, WrappedEnv::kClassName(), &env, &guard));
  ASSERT_NE(env, nullptr);
  ASSERT_NE(env, Env::Default());
  ASSERT_FALSE(guard->AreEquivalent(options, Env::Default(), &mismatch));

  opt_str = env->ToString(options);
  ASSERT_OK(Env::CreateFromString(options, opt_str, &env, &copy));
  ASSERT_NE(copy, guard);
  ASSERT_TRUE(guard->AreEquivalent(options, copy.get(), &mismatch));

  // Two levels of wrapping.
  guard.reset(new WrappedEnv(std::make_shared<WrappedEnv>(Env::Default())));
  ASSERT_NE(guard.get(), env);
  opt_str = guard->ToString(options);
  ASSERT_OK(Env::CreateFromString(options, opt_str, &env, &copy));
  ASSERT_NE(copy, guard);
  ASSERT_TRUE(guard->AreEquivalent(options, copy.get(), &mismatch));

  // Three levels of wrapping.
  guard.reset(new WrappedEnv(std::make_shared<WrappedEnv>(
      std::make_shared<WrappedEnv>(Env::Default()))));
  ASSERT_NE(guard.get(), env);
  opt_str = guard->ToString(options);
  ASSERT_OK(Env::CreateFromString(options, opt_str, &env, &copy));
  ASSERT_NE(copy, guard);
  ASSERT_TRUE(guard->AreEquivalent(options, copy.get(), &mismatch));
}
+
// Composite environments (custom FileSystem and/or SystemClock over a base
// Env) must serialize and round-trip, and a composite with a null inner Env
// must fail option validation until PrepareOptions fills in the default.
TEST_F(CreateEnvTest, CreateCompositeEnv) {
  ConfigOptions options;
  options.ignore_unsupported_options = false;
  std::shared_ptr<Env> guard, copy;
  Env* env = nullptr;
  std::string mismatch, opt_str;

  WrappedEnv::Register(*(options.registry->AddLibrary("test")), "");
  std::unique_ptr<Env> base(NewCompositeEnv(FileSystem::Default()));
  std::unique_ptr<Env> wrapped(new WrappedEnv(Env::Default()));
  std::shared_ptr<FileSystem> timed_fs =
      std::make_shared<TimedFileSystem>(FileSystem::Default());
  std::shared_ptr<SystemClock> clock =
      std::make_shared<EmulatedSystemClock>(SystemClock::Default());

  // Composite over the default FS: creatable only with a guard.
  opt_str = base->ToString(options);
  ASSERT_NOK(Env::CreateFromString(options, opt_str, &env));
  ASSERT_OK(Env::CreateFromString(options, opt_str, &env, &guard));
  ASSERT_NE(env, nullptr);
  ASSERT_NE(env, Env::Default());
  ASSERT_EQ(env->GetFileSystem(), FileSystem::Default());
  ASSERT_EQ(env->GetSystemClock(), SystemClock::Default());

  // Composite over a non-default (timed) FS.
  base = NewCompositeEnv(timed_fs);
  opt_str = base->ToString(options);
  ASSERT_NOK(Env::CreateFromString(options, opt_str, &env));
  ASSERT_OK(Env::CreateFromString(options, opt_str, &env, &guard));
  ASSERT_NE(env, nullptr);
  ASSERT_NE(env, Env::Default());
  ASSERT_NE(env->GetFileSystem(), FileSystem::Default());
  ASSERT_EQ(env->GetSystemClock(), SystemClock::Default());

  // Wrapper + custom FS.
  env = nullptr;
  guard.reset(new CompositeEnvWrapper(wrapped.get(), timed_fs));
  opt_str = guard->ToString(options);
  ASSERT_OK(Env::CreateFromString(options, opt_str, &env, &copy));
  ASSERT_NE(env, nullptr);
  ASSERT_NE(env, Env::Default());
  ASSERT_TRUE(guard->AreEquivalent(options, copy.get(), &mismatch));

  // Wrapper + custom clock.
  env = nullptr;
  guard.reset(new CompositeEnvWrapper(wrapped.get(), clock));
  opt_str = guard->ToString(options);
  ASSERT_OK(Env::CreateFromString(options, opt_str, &env, &copy));
  ASSERT_NE(env, nullptr);
  ASSERT_NE(env, Env::Default());
  ASSERT_TRUE(guard->AreEquivalent(options, copy.get(), &mismatch));

  // Wrapper + custom FS + custom clock.
  env = nullptr;
  guard.reset(new CompositeEnvWrapper(wrapped.get(), timed_fs, clock));
  opt_str = guard->ToString(options);
  ASSERT_OK(Env::CreateFromString(options, opt_str, &env, &copy));
  ASSERT_NE(env, nullptr);
  ASSERT_NE(env, Env::Default());
  ASSERT_TRUE(guard->AreEquivalent(options, copy.get(), &mismatch));

  // A composite with a null inner Env fails validation until
  // PrepareOptions() substitutes the default Env.
  guard.reset(new CompositeEnvWrapper(nullptr, timed_fs, clock));
  ColumnFamilyOptions cf_opts;
  DBOptions db_opts;
  db_opts.env = guard.get();
  auto comp = db_opts.env->CheckedCast<CompositeEnvWrapper>();
  ASSERT_NE(comp, nullptr);
  ASSERT_EQ(comp->Inner(), nullptr);
  ASSERT_NOK(ValidateOptions(db_opts, cf_opts));
  ASSERT_OK(db_opts.env->PrepareOptions(options));
  ASSERT_NE(comp->Inner(), nullptr);
  ASSERT_OK(ValidateOptions(db_opts, cf_opts));
}
+#endif // ROCKSDB_LITE
+
+// Forward declaration
+class ReadAsyncFS;
+
// Bookkeeping for one in-flight asynchronous read: the completion callback,
// its argument, and whether this request should simulate an IO error
// (error requests are completed in ReadAsyncFS::Poll instead of a worker).
struct MockIOHandle {
  std::function<void(const FSReadRequest&, void*)> cb;
  void* cb_arg;
  bool create_io_error;
};
+
+// ReadAsyncFS and ReadAsyncRandomAccessFile mocks the FS doing asynchronous
+// reads by creating threads that submit read requests and then calling Poll API
+// to obtain those results.
+class ReadAsyncRandomAccessFile : public FSRandomAccessFileOwnerWrapper {
+ public:
+ ReadAsyncRandomAccessFile(ReadAsyncFS& fs,
+ std::unique_ptr<FSRandomAccessFile>& file)
+ : FSRandomAccessFileOwnerWrapper(std::move(file)), fs_(fs) {}
+
+ IOStatus ReadAsync(FSReadRequest& req, const IOOptions& opts,
+ std::function<void(const FSReadRequest&, void*)> cb,
+ void* cb_arg, void** io_handle, IOHandleDeleter* del_fn,
+ IODebugContext* dbg) override;
+
+ private:
+ ReadAsyncFS& fs_;
+ std::unique_ptr<FSRandomAccessFile> file_;
+ int counter = 0;
+};
+
+class ReadAsyncFS : public FileSystemWrapper {
+ public:
+ explicit ReadAsyncFS(const std::shared_ptr<FileSystem>& wrapped)
+ : FileSystemWrapper(wrapped) {}
+
+ static const char* kClassName() { return "ReadAsyncFS"; }
+ const char* Name() const override { return kClassName(); }
+
+ IOStatus NewRandomAccessFile(const std::string& fname,
+ const FileOptions& opts,
+ std::unique_ptr<FSRandomAccessFile>* result,
+ IODebugContext* dbg) override {
+ std::unique_ptr<FSRandomAccessFile> file;
+ IOStatus s = target()->NewRandomAccessFile(fname, opts, &file, dbg);
+ EXPECT_OK(s);
+ result->reset(new ReadAsyncRandomAccessFile(*this, file));
+ return s;
+ }
+
+ IOStatus Poll(std::vector<void*>& io_handles,
+ size_t /*min_completions*/) override {
+ // Wait for the threads completion.
+ for (auto& t : workers) {
+ t.join();
+ }
+
+ for (size_t i = 0; i < io_handles.size(); i++) {
+ MockIOHandle* handle = static_cast<MockIOHandle*>(io_handles[i]);
+ if (handle->create_io_error) {
+ FSReadRequest req;
+ req.status = IOStatus::IOError();
+ handle->cb(req, handle->cb_arg);
+ }
+ }
+ return IOStatus::OK();
+ }
+
+ std::vector<std::thread> workers;
+};
+
// Queue `req` for asynchronous completion: allocate a MockIOHandle for the
// caller, then start a worker thread (tracked by fs_) that performs the
// synchronous Read and invokes `cb`. Every second request is instead marked
// as a simulated IO error and completed later by ReadAsyncFS::Poll().
IOStatus ReadAsyncRandomAccessFile::ReadAsync(
    FSReadRequest& req, const IOOptions& opts,
    std::function<void(const FSReadRequest&, void*)> cb, void* cb_arg,
    void** io_handle, IOHandleDeleter* del_fn, IODebugContext* dbg) {
  // Hand the caller a deleter for the heap-allocated MockIOHandle below.
  IOHandleDeleter deletefn = [](void* args) -> void {
    delete (static_cast<MockIOHandle*>(args));
    args = nullptr;
  };
  *del_fn = deletefn;

  // Allocate and populate io_handle.
  MockIOHandle* mock_handle = new MockIOHandle();
  bool create_io_error = false;
  if (counter % 2) {
    create_io_error = true;
  }
  mock_handle->create_io_error = create_io_error;
  mock_handle->cb = cb;
  mock_handle->cb_arg = cb_arg;
  *io_handle = static_cast<void*>(mock_handle);
  counter++;

  // Submit read request asynchronously.
  // NOTE(review): `opts` is captured by reference, so the IOOptions passed
  // in must outlive the worker thread (joined in Poll()). The test
  // guarantees this; confirm before reusing this mock elsewhere.
  std::function<void(FSReadRequest)> submit_request =
      [&opts, cb, cb_arg, dbg, create_io_error, this](FSReadRequest _req) {
        if (!create_io_error) {
          _req.status = target()->Read(_req.offset, _req.len, opts,
                                       &(_req.result), _req.scratch, dbg);
          cb(_req, cb_arg);
        }
      };

  // The thread receives a copy of `req`; it is joined in ReadAsyncFS::Poll.
  fs_.workers.emplace_back(submit_request, req);
  return IOStatus::OK();
}
+
// Fixture providing the default Env for the asynchronous-read test below.
class TestAsyncRead : public testing::Test {
 public:
  TestAsyncRead() { env_ = Env::Default(); }
  Env* env_;
};
+
+// Tests the default implementation of ReadAsync API.
// Tests the default implementation of ReadAsync API.
// Writes kNumSectors sectors of known data, submits one async read per
// sector through ReadAsyncFS, polls for completion, and verifies that
// even-numbered requests return the written data while odd-numbered
// requests surface the simulated IOError.
TEST_F(TestAsyncRead, ReadAsync) {
  EnvOptions soptions;
  std::shared_ptr<ReadAsyncFS> fs =
      std::make_shared<ReadAsyncFS>(env_->GetFileSystem());

  std::string fname = test::PerThreadDBPath(env_, "testfile");

  const size_t kSectorSize = 4096;
  const size_t kNumSectors = 8;

  // 1. create & write to a file.
  {
    std::unique_ptr<FSWritableFile> wfile;
    ASSERT_OK(
        fs->NewWritableFile(fname, FileOptions(), &wfile, nullptr /*dbg*/));

    // Sector i is filled with the byte value (i + 1).
    for (size_t i = 0; i < kNumSectors; ++i) {
      auto data = NewAligned(kSectorSize * 8, static_cast<char>(i + 1));
      Slice slice(data.get(), kSectorSize);
      ASSERT_OK(wfile->Append(slice, IOOptions(), nullptr));
    }
    ASSERT_OK(wfile->Close(IOOptions(), nullptr));
  }
  // 2. Read file
  {
    std::unique_ptr<FSRandomAccessFile> file;
    ASSERT_OK(fs->NewRandomAccessFile(fname, FileOptions(), &file, nullptr));

    IOOptions opts;
    std::vector<void*> io_handles(kNumSectors);
    std::vector<FSReadRequest> reqs(kNumSectors);
    std::vector<std::unique_ptr<char, Deleter>> data;
    std::vector<size_t> vals;
    IOHandleDeleter del_fn;
    uint64_t offset = 0;

    // Initialize read requests: one sector-sized read per sector, with an
    // aligned scratch buffer and the request index as callback argument.
    for (size_t i = 0; i < kNumSectors; i++) {
      reqs[i].offset = offset;
      reqs[i].len = kSectorSize;
      data.emplace_back(NewAligned(kSectorSize, 0));
      reqs[i].scratch = data.back().get();
      vals.push_back(i);
      offset += kSectorSize;
    }

    // callback function passed to async read: copies the completed request
    // back into reqs[i] using the index carried through cb_arg.
    std::function<void(const FSReadRequest&, void*)> callback =
        [&](const FSReadRequest& req, void* cb_arg) {
          assert(cb_arg != nullptr);
          size_t i = *(reinterpret_cast<size_t*>(cb_arg));
          reqs[i].offset = req.offset;
          reqs[i].result = req.result;
          reqs[i].status = req.status;
        };

    // Submit asynchronous read requests.
    for (size_t i = 0; i < kNumSectors; i++) {
      void* cb_arg = static_cast<void*>(&(vals[i]));
      ASSERT_OK(file->ReadAsync(reqs[i], opts, callback, cb_arg,
                                &(io_handles[i]), &del_fn, nullptr));
    }

    // Poll for the submitted requests.
    fs->Poll(io_handles, kNumSectors);

    // Check the status of read requests: odd indices were flagged as
    // simulated errors by the mock, even indices must match written data.
    for (size_t i = 0; i < kNumSectors; i++) {
      if (i % 2) {
        ASSERT_EQ(reqs[i].status, IOStatus::IOError());
      } else {
        auto buf = NewAligned(kSectorSize * 8, static_cast<char>(i + 1));
        Slice expected_data(buf.get(), kSectorSize);

        ASSERT_EQ(reqs[i].offset, i * kSectorSize);
        ASSERT_OK(reqs[i].status);
        ASSERT_EQ(expected_data.ToString(), reqs[i].result.ToString());
      }
    }

    // Delete io_handles.
    for (size_t i = 0; i < io_handles.size(); i++) {
      del_fn(io_handles[i]);
    }
  }
}
+} // namespace ROCKSDB_NAMESPACE
+
// Test entry point: install the stack-trace handler (for readable crash
// output) before running all registered gtest cases.
int main(int argc, char** argv) {
  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}
diff --git a/src/rocksdb/env/file_system.cc b/src/rocksdb/env/file_system.cc
new file mode 100644
index 000000000..f9dda429a
--- /dev/null
+++ b/src/rocksdb/env/file_system.cc
@@ -0,0 +1,290 @@
+// Copyright (c) 2019-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#include "rocksdb/file_system.h"
+
+#include "env/composite_env_wrapper.h"
+#include "env/env_chroot.h"
+#include "env/env_encryption_ctr.h"
+#include "env/fs_readonly.h"
+#include "env/mock_env.h"
+#include "logging/env_logger.h"
+#include "options/db_options.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/utilities/customizable_util.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "rocksdb/utilities/options_type.h"
+#include "util/string_util.h"
+#include "utilities/counted_fs.h"
+#include "utilities/env_timed.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+FileSystem::FileSystem() {}
+
+FileSystem::~FileSystem() {}
+
+Status FileSystem::Load(const std::string& value,
+ std::shared_ptr<FileSystem>* result) {
+ return CreateFromString(ConfigOptions(), value, result);
+}
+
+#ifndef ROCKSDB_LITE
// Registers factories for the FileSystem implementations that ship with
// RocksDB (timed, read-only, encrypted, counted, mock, chroot) so they can
// be created by name through the ObjectLibrary. Returns the number of
// registered factories. Called once from CreateFromString().
static int RegisterBuiltinFileSystems(ObjectLibrary& library,
                                      const std::string& /*arg*/) {
  library.AddFactory<FileSystem>(
      TimedFileSystem::kClassName(),
      [](const std::string& /*uri*/, std::unique_ptr<FileSystem>* guard,
         std::string* /* errmsg */) {
        guard->reset(new TimedFileSystem(nullptr));
        return guard->get();
      });
  library.AddFactory<FileSystem>(
      ReadOnlyFileSystem::kClassName(),
      [](const std::string& /*uri*/, std::unique_ptr<FileSystem>* guard,
         std::string* /* errmsg */) {
        guard->reset(new ReadOnlyFileSystem(nullptr));
        return guard->get();
      });
  library.AddFactory<FileSystem>(
      EncryptedFileSystem::kClassName(),
      [](const std::string& /*uri*/, std::unique_ptr<FileSystem>* guard,
         std::string* errmsg) {
        // Creation can fail (e.g. missing provider); report via errmsg.
        Status s = NewEncryptedFileSystemImpl(nullptr, nullptr, guard);
        if (!s.ok()) {
          *errmsg = s.ToString();
        }
        return guard->get();
      });
  library.AddFactory<FileSystem>(
      CountedFileSystem::kClassName(),
      [](const std::string& /*uri*/, std::unique_ptr<FileSystem>* guard,
         std::string* /*errmsg*/) {
        guard->reset(new CountedFileSystem(FileSystem::Default()));
        return guard->get();
      });
  library.AddFactory<FileSystem>(
      MockFileSystem::kClassName(),
      [](const std::string& /*uri*/, std::unique_ptr<FileSystem>* guard,
         std::string* /*errmsg*/) {
        guard->reset(new MockFileSystem(SystemClock::Default()));
        return guard->get();
      });
#ifndef OS_WIN
  // Chroot is POSIX-only.
  library.AddFactory<FileSystem>(
      ChrootFileSystem::kClassName(),
      [](const std::string& /*uri*/, std::unique_ptr<FileSystem>* guard,
         std::string* /* errmsg */) {
        guard->reset(new ChrootFileSystem(nullptr, ""));
        return guard->get();
      });
#endif  // OS_WIN
  size_t num_types;
  return static_cast<int>(library.GetFactoryCount(&num_types));
}
+#endif // ROCKSDB_LITE
+
+Status FileSystem::CreateFromString(const ConfigOptions& config_options,
+ const std::string& value,
+ std::shared_ptr<FileSystem>* result) {
+ auto default_fs = FileSystem::Default();
+ if (default_fs->IsInstanceOf(value)) {
+ *result = default_fs;
+ return Status::OK();
+ } else {
+#ifndef ROCKSDB_LITE
+ static std::once_flag once;
+ std::call_once(once, [&]() {
+ RegisterBuiltinFileSystems(*(ObjectLibrary::Default().get()), "");
+ });
+#endif // ROCKSDB_LITE
+ return LoadSharedObject<FileSystem>(config_options, value, nullptr, result);
+ }
+}
+
+IOStatus FileSystem::ReuseWritableFile(const std::string& fname,
+ const std::string& old_fname,
+ const FileOptions& opts,
+ std::unique_ptr<FSWritableFile>* result,
+ IODebugContext* dbg) {
+ IOStatus s = RenameFile(old_fname, fname, opts.io_options, dbg);
+ if (!s.ok()) {
+ return s;
+ }
+ return NewWritableFile(fname, opts, result, dbg);
+}
+
+IOStatus FileSystem::NewLogger(const std::string& fname,
+ const IOOptions& io_opts,
+ std::shared_ptr<Logger>* result,
+ IODebugContext* dbg) {
+ FileOptions options;
+ options.io_options = io_opts;
+ // TODO: Tune the buffer size.
+ options.writable_file_max_buffer_size = 1024 * 1024;
+ std::unique_ptr<FSWritableFile> writable_file;
+ const IOStatus status = NewWritableFile(fname, options, &writable_file, dbg);
+ if (!status.ok()) {
+ return status;
+ }
+
+ *result = std::make_shared<EnvLogger>(std::move(writable_file), fname,
+ options, Env::Default());
+ return IOStatus::OK();
+}
+
+FileOptions FileSystem::OptimizeForLogRead(
+ const FileOptions& file_options) const {
+ FileOptions optimized_file_options(file_options);
+ optimized_file_options.use_direct_reads = false;
+ return optimized_file_options;
+}
+
+FileOptions FileSystem::OptimizeForManifestRead(
+ const FileOptions& file_options) const {
+ FileOptions optimized_file_options(file_options);
+ optimized_file_options.use_direct_reads = false;
+ return optimized_file_options;
+}
+
+FileOptions FileSystem::OptimizeForLogWrite(const FileOptions& file_options,
+ const DBOptions& db_options) const {
+ FileOptions optimized_file_options(file_options);
+ optimized_file_options.bytes_per_sync = db_options.wal_bytes_per_sync;
+ optimized_file_options.writable_file_max_buffer_size =
+ db_options.writable_file_max_buffer_size;
+ return optimized_file_options;
+}
+
// MANIFEST writes currently have no file-option specializations; the
// caller's options are returned unchanged.
FileOptions FileSystem::OptimizeForManifestWrite(
    const FileOptions& file_options) const {
  return file_options;
}
+
+FileOptions FileSystem::OptimizeForCompactionTableWrite(
+ const FileOptions& file_options,
+ const ImmutableDBOptions& db_options) const {
+ FileOptions optimized_file_options(file_options);
+ optimized_file_options.use_direct_writes =
+ db_options.use_direct_io_for_flush_and_compaction;
+ return optimized_file_options;
+}
+
+FileOptions FileSystem::OptimizeForCompactionTableRead(
+ const FileOptions& file_options,
+ const ImmutableDBOptions& db_options) const {
+ FileOptions optimized_file_options(file_options);
+ optimized_file_options.use_direct_reads = db_options.use_direct_reads;
+ return optimized_file_options;
+}
+
+FileOptions FileSystem::OptimizeForBlobFileRead(
+ const FileOptions& file_options,
+ const ImmutableDBOptions& db_options) const {
+ FileOptions optimized_file_options(file_options);
+ optimized_file_options.use_direct_reads = db_options.use_direct_reads;
+ return optimized_file_options;
+}
+
+IOStatus WriteStringToFile(FileSystem* fs, const Slice& data,
+ const std::string& fname, bool should_sync) {
+ std::unique_ptr<FSWritableFile> file;
+ EnvOptions soptions;
+ IOStatus s = fs->NewWritableFile(fname, soptions, &file, nullptr);
+ if (!s.ok()) {
+ return s;
+ }
+ s = file->Append(data, IOOptions(), nullptr);
+ if (s.ok() && should_sync) {
+ s = file->Sync(IOOptions(), nullptr);
+ }
+ if (!s.ok()) {
+ fs->DeleteFile(fname, IOOptions(), nullptr);
+ }
+ return s;
+}
+
+IOStatus ReadFileToString(FileSystem* fs, const std::string& fname,
+ std::string* data) {
+ FileOptions soptions;
+ data->clear();
+ std::unique_ptr<FSSequentialFile> file;
+ IOStatus s = status_to_io_status(
+ fs->NewSequentialFile(fname, soptions, &file, nullptr));
+ if (!s.ok()) {
+ return s;
+ }
+ static const int kBufferSize = 8192;
+ char* space = new char[kBufferSize];
+ while (true) {
+ Slice fragment;
+ s = file->Read(kBufferSize, IOOptions(), &fragment, space, nullptr);
+ if (!s.ok()) {
+ break;
+ }
+ data->append(fragment.data(), fragment.size());
+ if (fragment.empty()) {
+ break;
+ }
+ }
+ delete[] space;
+ return s;
+}
+
namespace {
// Option mapping for FileSystemWrapper: exposes the wrapped "target" file
// system to the configuration framework. kDontSerialize because
// SerializeOptions() below emits the target explicitly; kByName so
// equivalence comparisons match targets by their registered names.
static std::unordered_map<std::string, OptionTypeInfo> fs_wrapper_type_info = {
#ifndef ROCKSDB_LITE
    {"target",
     OptionTypeInfo::AsCustomSharedPtr<FileSystem>(
         0, OptionVerificationType::kByName, OptionTypeFlags::kDontSerialize)},
#endif  // ROCKSDB_LITE
};
}  // namespace
// Registers the "target" option so the wrapped file system participates in
// configuration, serialization, and equivalence checks.
FileSystemWrapper::FileSystemWrapper(const std::shared_ptr<FileSystem>& t)
    : target_(t) {
  RegisterOptions("", &target_, &fs_wrapper_type_info);
}
+
+Status FileSystemWrapper::PrepareOptions(const ConfigOptions& options) {
+ if (target_ == nullptr) {
+ target_ = FileSystem::Default();
+ }
+ return FileSystem::PrepareOptions(options);
+}
+
+#ifndef ROCKSDB_LITE
+std::string FileSystemWrapper::SerializeOptions(
+ const ConfigOptions& config_options, const std::string& header) const {
+ auto parent = FileSystem::SerializeOptions(config_options, "");
+ if (config_options.IsShallow() || target_ == nullptr ||
+ target_->IsInstanceOf(FileSystem::kDefaultName())) {
+ return parent;
+ } else {
+ std::string result = header;
+ if (!StartsWith(parent, OptionTypeInfo::kIdPropName())) {
+ result.append(OptionTypeInfo::kIdPropName()).append("=");
+ }
+ result.append(parent);
+ if (!EndsWith(result, config_options.delimiter)) {
+ result.append(config_options.delimiter);
+ }
+ result.append("target=").append(target_->ToString(config_options));
+ return result;
+ }
+}
+#endif // ROCKSDB_LITE
+
+DirFsyncOptions::DirFsyncOptions() { reason = kDefault; }
+
+DirFsyncOptions::DirFsyncOptions(std::string file_renamed_new_name) {
+ reason = kFileRenamed;
+ renamed_new_name = file_renamed_new_name;
+}
+
+DirFsyncOptions::DirFsyncOptions(FsyncReason fsync_reason) {
+ assert(fsync_reason != kFileRenamed);
+ reason = fsync_reason;
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/env/file_system_tracer.cc b/src/rocksdb/env/file_system_tracer.cc
new file mode 100644
index 000000000..d0c45c57e
--- /dev/null
+++ b/src/rocksdb/env/file_system_tracer.cc
@@ -0,0 +1,564 @@
+// Copyright (c) 2019-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "env/file_system_tracer.h"
+
+#include "rocksdb/file_system.h"
+#include "rocksdb/system_clock.h"
+#include "rocksdb/trace_record.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+IOStatus FileSystemTracingWrapper::NewSequentialFile(
+ const std::string& fname, const FileOptions& file_opts,
+ std::unique_ptr<FSSequentialFile>* result, IODebugContext* dbg) {
+ StopWatchNano timer(clock_);
+ timer.Start();
+ IOStatus s = target()->NewSequentialFile(fname, file_opts, result, dbg);
+ uint64_t elapsed = timer.ElapsedNanos();
+ IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer,
+ 0 /*io_op_data*/, __func__, elapsed, s.ToString(),
+ fname.substr(fname.find_last_of("/\\") + 1));
+ io_tracer_->WriteIOOp(io_record, dbg);
+ return s;
+}
+
+IOStatus FileSystemTracingWrapper::NewRandomAccessFile(
+ const std::string& fname, const FileOptions& file_opts,
+ std::unique_ptr<FSRandomAccessFile>* result, IODebugContext* dbg) {
+ StopWatchNano timer(clock_);
+ timer.Start();
+ IOStatus s = target()->NewRandomAccessFile(fname, file_opts, result, dbg);
+ uint64_t elapsed = timer.ElapsedNanos();
+ IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer,
+ 0 /*io_op_data*/, __func__, elapsed, s.ToString(),
+ fname.substr(fname.find_last_of("/\\") + 1));
+ io_tracer_->WriteIOOp(io_record, dbg);
+ return s;
+}
+
+IOStatus FileSystemTracingWrapper::NewWritableFile(
+ const std::string& fname, const FileOptions& file_opts,
+ std::unique_ptr<FSWritableFile>* result, IODebugContext* dbg) {
+ StopWatchNano timer(clock_);
+ timer.Start();
+ IOStatus s = target()->NewWritableFile(fname, file_opts, result, dbg);
+ uint64_t elapsed = timer.ElapsedNanos();
+ IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer,
+ 0 /*io_op_data*/, __func__, elapsed, s.ToString(),
+ fname.substr(fname.find_last_of("/\\") + 1));
+ io_tracer_->WriteIOOp(io_record, dbg);
+ return s;
+}
+
+IOStatus FileSystemTracingWrapper::ReopenWritableFile(
+ const std::string& fname, const FileOptions& file_opts,
+ std::unique_ptr<FSWritableFile>* result, IODebugContext* dbg) {
+ StopWatchNano timer(clock_);
+ timer.Start();
+ IOStatus s = target()->ReopenWritableFile(fname, file_opts, result, dbg);
+ uint64_t elapsed = timer.ElapsedNanos();
+ IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer,
+ 0 /*io_op_data*/, __func__, elapsed, s.ToString(),
+ fname.substr(fname.find_last_of("/\\") + 1));
+ io_tracer_->WriteIOOp(io_record, dbg);
+ return s;
+}
+
+IOStatus FileSystemTracingWrapper::ReuseWritableFile(
+ const std::string& fname, const std::string& old_fname,
+ const FileOptions& file_opts, std::unique_ptr<FSWritableFile>* result,
+ IODebugContext* dbg) {
+ StopWatchNano timer(clock_);
+ timer.Start();
+ IOStatus s =
+ target()->ReuseWritableFile(fname, old_fname, file_opts, result, dbg);
+ uint64_t elapsed = timer.ElapsedNanos();
+ IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer,
+ 0 /*io_op_data*/, __func__, elapsed, s.ToString(),
+ fname.substr(fname.find_last_of("/\\") + 1));
+ io_tracer_->WriteIOOp(io_record, dbg);
+ return s;
+}
+
+IOStatus FileSystemTracingWrapper::NewRandomRWFile(
+ const std::string& fname, const FileOptions& file_opts,
+ std::unique_ptr<FSRandomRWFile>* result, IODebugContext* dbg) {
+ StopWatchNano timer(clock_);
+ timer.Start();
+ IOStatus s = target()->NewRandomRWFile(fname, file_opts, result, dbg);
+ uint64_t elapsed = timer.ElapsedNanos();
+ IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer,
+ 0 /*io_op_data*/, __func__, elapsed, s.ToString(),
+ fname.substr(fname.find_last_of("/\\") + 1));
+ io_tracer_->WriteIOOp(io_record, dbg);
+ return s;
+}
+
+IOStatus FileSystemTracingWrapper::NewDirectory(
+ const std::string& name, const IOOptions& io_opts,
+ std::unique_ptr<FSDirectory>* result, IODebugContext* dbg) {
+ StopWatchNano timer(clock_);
+ timer.Start();
+ IOStatus s = target()->NewDirectory(name, io_opts, result, dbg);
+ uint64_t elapsed = timer.ElapsedNanos();
+ IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer,
+ 0 /*io_op_data*/, __func__, elapsed, s.ToString(),
+ name.substr(name.find_last_of("/\\") + 1));
+ io_tracer_->WriteIOOp(io_record, dbg);
+ return s;
+}
+
+IOStatus FileSystemTracingWrapper::GetChildren(const std::string& dir,
+ const IOOptions& io_opts,
+ std::vector<std::string>* r,
+ IODebugContext* dbg) {
+ StopWatchNano timer(clock_);
+ timer.Start();
+ IOStatus s = target()->GetChildren(dir, io_opts, r, dbg);
+ uint64_t elapsed = timer.ElapsedNanos();
+ IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer,
+ 0 /*io_op_data*/, __func__, elapsed, s.ToString(),
+ dir.substr(dir.find_last_of("/\\") + 1));
+ io_tracer_->WriteIOOp(io_record, dbg);
+ return s;
+}
+
+IOStatus FileSystemTracingWrapper::DeleteFile(const std::string& fname,
+ const IOOptions& options,
+ IODebugContext* dbg) {
+ StopWatchNano timer(clock_);
+ timer.Start();
+ IOStatus s = target()->DeleteFile(fname, options, dbg);
+ uint64_t elapsed = timer.ElapsedNanos();
+ IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer,
+ 0 /*io_op_data*/, __func__, elapsed, s.ToString(),
+ fname.substr(fname.find_last_of("/\\") + 1));
+ io_tracer_->WriteIOOp(io_record, dbg);
+ return s;
+}
+
+IOStatus FileSystemTracingWrapper::CreateDir(const std::string& dirname,
+ const IOOptions& options,
+ IODebugContext* dbg) {
+ StopWatchNano timer(clock_);
+ timer.Start();
+ IOStatus s = target()->CreateDir(dirname, options, dbg);
+ uint64_t elapsed = timer.ElapsedNanos();
+ IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer,
+ 0 /*io_op_data*/, __func__, elapsed, s.ToString(),
+ dirname.substr(dirname.find_last_of("/\\") + 1));
+ io_tracer_->WriteIOOp(io_record, dbg);
+ return s;
+}
+
+IOStatus FileSystemTracingWrapper::CreateDirIfMissing(
+ const std::string& dirname, const IOOptions& options, IODebugContext* dbg) {
+ StopWatchNano timer(clock_);
+ timer.Start();
+ IOStatus s = target()->CreateDirIfMissing(dirname, options, dbg);
+ uint64_t elapsed = timer.ElapsedNanos();
+ IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer,
+ 0 /*io_op_data*/, __func__, elapsed, s.ToString(),
+ dirname.substr(dirname.find_last_of("/\\") + 1));
+ io_tracer_->WriteIOOp(io_record, dbg);
+ return s;
+}
+
+IOStatus FileSystemTracingWrapper::DeleteDir(const std::string& dirname,
+ const IOOptions& options,
+ IODebugContext* dbg) {
+ StopWatchNano timer(clock_);
+ timer.Start();
+ IOStatus s = target()->DeleteDir(dirname, options, dbg);
+ uint64_t elapsed = timer.ElapsedNanos();
+ IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer,
+ 0 /*io_op_data*/, __func__, elapsed, s.ToString(),
+ dirname.substr(dirname.find_last_of("/\\") + 1));
+ io_tracer_->WriteIOOp(io_record, dbg);
+ return s;
+}
+
+IOStatus FileSystemTracingWrapper::GetFileSize(const std::string& fname,
+ const IOOptions& options,
+ uint64_t* file_size,
+ IODebugContext* dbg) {
+ StopWatchNano timer(clock_);
+ timer.Start();
+ IOStatus s = target()->GetFileSize(fname, options, file_size, dbg);
+ uint64_t elapsed = timer.ElapsedNanos();
+ uint64_t io_op_data = 0;
+ io_op_data |= (1 << IOTraceOp::kIOFileSize);
+ IOTraceRecord io_record(
+ clock_->NowNanos(), TraceType::kIOTracer, io_op_data, __func__, elapsed,
+ s.ToString(), fname.substr(fname.find_last_of("/\\") + 1), *file_size);
+ io_tracer_->WriteIOOp(io_record, dbg);
+ return s;
+}
+
+IOStatus FileSystemTracingWrapper::Truncate(const std::string& fname,
+ size_t size,
+ const IOOptions& options,
+ IODebugContext* dbg) {
+ StopWatchNano timer(clock_);
+ timer.Start();
+ IOStatus s = target()->Truncate(fname, size, options, dbg);
+ uint64_t elapsed = timer.ElapsedNanos();
+ uint64_t io_op_data = 0;
+ io_op_data |= (1 << IOTraceOp::kIOFileSize);
+ IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data,
+ __func__, elapsed, s.ToString(),
+ fname.substr(fname.find_last_of("/\\") + 1), size);
+ io_tracer_->WriteIOOp(io_record, dbg);
+ return s;
+}
+
+IOStatus FSSequentialFileTracingWrapper::Read(size_t n,
+ const IOOptions& options,
+ Slice* result, char* scratch,
+ IODebugContext* dbg) {
+ StopWatchNano timer(clock_);
+ timer.Start();
+ IOStatus s = target()->Read(n, options, result, scratch, dbg);
+ uint64_t elapsed = timer.ElapsedNanos();
+ uint64_t io_op_data = 0;
+ io_op_data |= (1 << IOTraceOp::kIOLen);
+ IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data,
+ __func__, elapsed, s.ToString(), file_name_,
+ result->size(), 0 /*Offset*/);
+ io_tracer_->WriteIOOp(io_record, dbg);
+ return s;
+}
+
+IOStatus FSSequentialFileTracingWrapper::InvalidateCache(size_t offset,
+ size_t length) {
+ StopWatchNano timer(clock_);
+ timer.Start();
+ IOStatus s = target()->InvalidateCache(offset, length);
+ uint64_t elapsed = timer.ElapsedNanos();
+ uint64_t io_op_data = 0;
+ io_op_data |= (1 << IOTraceOp::kIOLen);
+ io_op_data |= (1 << IOTraceOp::kIOOffset);
+ IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data,
+ __func__, elapsed, s.ToString(), file_name_, length,
+ offset);
+ io_tracer_->WriteIOOp(io_record, nullptr /*dbg*/);
+ return s;
+}
+
+IOStatus FSSequentialFileTracingWrapper::PositionedRead(
+ uint64_t offset, size_t n, const IOOptions& options, Slice* result,
+ char* scratch, IODebugContext* dbg) {
+ StopWatchNano timer(clock_);
+ timer.Start();
+ IOStatus s =
+ target()->PositionedRead(offset, n, options, result, scratch, dbg);
+ uint64_t elapsed = timer.ElapsedNanos();
+ uint64_t io_op_data = 0;
+ io_op_data |= (1 << IOTraceOp::kIOLen);
+ io_op_data |= (1 << IOTraceOp::kIOOffset);
+ IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data,
+ __func__, elapsed, s.ToString(), file_name_,
+ result->size(), offset);
+ io_tracer_->WriteIOOp(io_record, dbg);
+ return s;
+}
+
+IOStatus FSRandomAccessFileTracingWrapper::Read(uint64_t offset, size_t n,
+ const IOOptions& options,
+ Slice* result, char* scratch,
+ IODebugContext* dbg) const {
+ StopWatchNano timer(clock_);
+ timer.Start();
+ IOStatus s = target()->Read(offset, n, options, result, scratch, dbg);
+ uint64_t elapsed = timer.ElapsedNanos();
+ uint64_t io_op_data = 0;
+ io_op_data |= (1 << IOTraceOp::kIOLen);
+ io_op_data |= (1 << IOTraceOp::kIOOffset);
+ IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data,
+ __func__, elapsed, s.ToString(), file_name_, n,
+ offset);
+ io_tracer_->WriteIOOp(io_record, dbg);
+ return s;
+}
+
+IOStatus FSRandomAccessFileTracingWrapper::MultiRead(FSReadRequest* reqs,
+ size_t num_reqs,
+ const IOOptions& options,
+ IODebugContext* dbg) {
+ StopWatchNano timer(clock_);
+ timer.Start();
+ IOStatus s = target()->MultiRead(reqs, num_reqs, options, dbg);
+ uint64_t elapsed = timer.ElapsedNanos();
+ uint64_t latency = elapsed;
+ uint64_t io_op_data = 0;
+ io_op_data |= (1 << IOTraceOp::kIOLen);
+ io_op_data |= (1 << IOTraceOp::kIOOffset);
+ for (size_t i = 0; i < num_reqs; i++) {
+ IOTraceRecord io_record(
+ clock_->NowNanos(), TraceType::kIOTracer, io_op_data, __func__, latency,
+ reqs[i].status.ToString(), file_name_, reqs[i].len, reqs[i].offset);
+ io_tracer_->WriteIOOp(io_record, dbg);
+ }
+ return s;
+}
+
+IOStatus FSRandomAccessFileTracingWrapper::Prefetch(uint64_t offset, size_t n,
+ const IOOptions& options,
+ IODebugContext* dbg) {
+ StopWatchNano timer(clock_);
+ timer.Start();
+ IOStatus s = target()->Prefetch(offset, n, options, dbg);
+ uint64_t elapsed = timer.ElapsedNanos();
+ uint64_t io_op_data = 0;
+ io_op_data |= (1 << IOTraceOp::kIOLen);
+ io_op_data |= (1 << IOTraceOp::kIOOffset);
+ IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data,
+ __func__, elapsed, s.ToString(), file_name_, n,
+ offset);
+ io_tracer_->WriteIOOp(io_record, dbg);
+ return s;
+}
+
+IOStatus FSRandomAccessFileTracingWrapper::InvalidateCache(size_t offset,
+ size_t length) {
+ StopWatchNano timer(clock_);
+ timer.Start();
+ IOStatus s = target()->InvalidateCache(offset, length);
+ uint64_t elapsed = timer.ElapsedNanos();
+ uint64_t io_op_data = 0;
+ io_op_data |= (1 << IOTraceOp::kIOLen);
+ io_op_data |= (1 << IOTraceOp::kIOOffset);
+ IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data,
+ __func__, elapsed, s.ToString(), file_name_, length,
+ static_cast<uint64_t>(offset));
+ io_tracer_->WriteIOOp(io_record, nullptr /*dbg*/);
+ return s;
+}
+
+IOStatus FSRandomAccessFileTracingWrapper::ReadAsync(
+ FSReadRequest& req, const IOOptions& opts,
+ std::function<void(const FSReadRequest&, void*)> cb, void* cb_arg,
+ void** io_handle, IOHandleDeleter* del_fn, IODebugContext* dbg) {
+ // Create a callback and populate info.
+ auto read_async_callback =
+ std::bind(&FSRandomAccessFileTracingWrapper::ReadAsyncCallback, this,
+ std::placeholders::_1, std::placeholders::_2);
+ ReadAsyncCallbackInfo* read_async_cb_info = new ReadAsyncCallbackInfo;
+ read_async_cb_info->cb_ = cb;
+ read_async_cb_info->cb_arg_ = cb_arg;
+ read_async_cb_info->start_time_ = clock_->NowNanos();
+ read_async_cb_info->file_op_ = __func__;
+
+ IOStatus s = target()->ReadAsync(req, opts, read_async_callback,
+ read_async_cb_info, io_handle, del_fn, dbg);
+
+ if (!s.ok()) {
+ delete read_async_cb_info;
+ }
+ return s;
+}
+
+void FSRandomAccessFileTracingWrapper::ReadAsyncCallback(
+ const FSReadRequest& req, void* cb_arg) {
+ ReadAsyncCallbackInfo* read_async_cb_info =
+ static_cast<ReadAsyncCallbackInfo*>(cb_arg);
+ assert(read_async_cb_info);
+ assert(read_async_cb_info->cb_);
+
+ uint64_t elapsed = clock_->NowNanos() - read_async_cb_info->start_time_;
+ uint64_t io_op_data = 0;
+ io_op_data |= (1 << IOTraceOp::kIOLen);
+ io_op_data |= (1 << IOTraceOp::kIOOffset);
+ IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data,
+ read_async_cb_info->file_op_, elapsed,
+ req.status.ToString(), file_name_, req.result.size(),
+ req.offset);
+ io_tracer_->WriteIOOp(io_record, nullptr /*dbg*/);
+
+ // call the underlying callback.
+ read_async_cb_info->cb_(req, read_async_cb_info->cb_arg_);
+ delete read_async_cb_info;
+}
+
+IOStatus FSWritableFileTracingWrapper::Append(const Slice& data,
+ const IOOptions& options,
+ IODebugContext* dbg) {
+ StopWatchNano timer(clock_);
+ timer.Start();
+ IOStatus s = target()->Append(data, options, dbg);
+ uint64_t elapsed = timer.ElapsedNanos();
+ uint64_t io_op_data = 0;
+ io_op_data |= (1 << IOTraceOp::kIOLen);
+ IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data,
+ __func__, elapsed, s.ToString(), file_name_,
+ data.size(), 0 /*Offset*/);
+ io_tracer_->WriteIOOp(io_record, dbg);
+ return s;
+}
+
+IOStatus FSWritableFileTracingWrapper::PositionedAppend(
+ const Slice& data, uint64_t offset, const IOOptions& options,
+ IODebugContext* dbg) {
+ StopWatchNano timer(clock_);
+ timer.Start();
+ IOStatus s = target()->PositionedAppend(data, offset, options, dbg);
+ uint64_t elapsed = timer.ElapsedNanos();
+ uint64_t io_op_data = 0;
+ io_op_data |= (1 << IOTraceOp::kIOLen);
+ io_op_data |= (1 << IOTraceOp::kIOOffset);
+ IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data,
+ __func__, elapsed, s.ToString(), file_name_,
+ data.size(), offset);
+ io_tracer_->WriteIOOp(io_record, dbg);
+ return s;
+}
+
+IOStatus FSWritableFileTracingWrapper::Truncate(uint64_t size,
+ const IOOptions& options,
+ IODebugContext* dbg) {
+ StopWatchNano timer(clock_);
+ timer.Start();
+ IOStatus s = target()->Truncate(size, options, dbg);
+ uint64_t elapsed = timer.ElapsedNanos();
+ uint64_t io_op_data = 0;
+ io_op_data |= (1 << IOTraceOp::kIOLen);
+ IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data,
+ __func__, elapsed, s.ToString(), file_name_, size,
+ 0 /*Offset*/);
+ io_tracer_->WriteIOOp(io_record, dbg);
+ return s;
+}
+
+IOStatus FSWritableFileTracingWrapper::Close(const IOOptions& options,
+ IODebugContext* dbg) {
+ StopWatchNano timer(clock_);
+ timer.Start();
+ IOStatus s = target()->Close(options, dbg);
+ uint64_t elapsed = timer.ElapsedNanos();
+ IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer,
+ 0 /*io_op_data*/, __func__, elapsed, s.ToString(),
+ file_name_);
+ io_tracer_->WriteIOOp(io_record, dbg);
+ return s;
+}
+
+uint64_t FSWritableFileTracingWrapper::GetFileSize(const IOOptions& options,
+ IODebugContext* dbg) {
+ StopWatchNano timer(clock_);
+ timer.Start();
+ uint64_t file_size = target()->GetFileSize(options, dbg);
+ uint64_t elapsed = timer.ElapsedNanos();
+ uint64_t io_op_data = 0;
+ io_op_data |= (1 << IOTraceOp::kIOFileSize);
+ IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data,
+ __func__, elapsed, "OK", file_name_, file_size);
+ io_tracer_->WriteIOOp(io_record, dbg);
+ return file_size;
+}
+
+IOStatus FSWritableFileTracingWrapper::InvalidateCache(size_t offset,
+ size_t length) {
+ StopWatchNano timer(clock_);
+ timer.Start();
+ IOStatus s = target()->InvalidateCache(offset, length);
+ uint64_t elapsed = timer.ElapsedNanos();
+ uint64_t io_op_data = 0;
+ io_op_data |= (1 << IOTraceOp::kIOLen);
+ io_op_data |= (1 << IOTraceOp::kIOOffset);
+ IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data,
+ __func__, elapsed, s.ToString(), file_name_, length,
+ static_cast<uint64_t>(offset));
+ io_tracer_->WriteIOOp(io_record, nullptr /*dbg*/);
+ return s;
+}
+
+IOStatus FSRandomRWFileTracingWrapper::Write(uint64_t offset, const Slice& data,
+ const IOOptions& options,
+ IODebugContext* dbg) {
+ StopWatchNano timer(clock_);
+ timer.Start();
+ IOStatus s = target()->Write(offset, data, options, dbg);
+ uint64_t elapsed = timer.ElapsedNanos();
+ uint64_t io_op_data = 0;
+ io_op_data |= (1 << IOTraceOp::kIOLen);
+ io_op_data |= (1 << IOTraceOp::kIOOffset);
+ IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data,
+ __func__, elapsed, s.ToString(), file_name_,
+ data.size(), offset);
+ io_tracer_->WriteIOOp(io_record, dbg);
+ return s;
+}
+
+IOStatus FSRandomRWFileTracingWrapper::Read(uint64_t offset, size_t n,
+ const IOOptions& options,
+ Slice* result, char* scratch,
+ IODebugContext* dbg) const {
+ StopWatchNano timer(clock_);
+ timer.Start();
+ IOStatus s = target()->Read(offset, n, options, result, scratch, dbg);
+ uint64_t elapsed = timer.ElapsedNanos();
+ uint64_t io_op_data = 0;
+ io_op_data |= (1 << IOTraceOp::kIOLen);
+ io_op_data |= (1 << IOTraceOp::kIOOffset);
+ IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data,
+ __func__, elapsed, s.ToString(), file_name_, n,
+ offset);
+ io_tracer_->WriteIOOp(io_record, dbg);
+ return s;
+}
+
+IOStatus FSRandomRWFileTracingWrapper::Flush(const IOOptions& options,
+ IODebugContext* dbg) {
+ StopWatchNano timer(clock_);
+ timer.Start();
+ IOStatus s = target()->Flush(options, dbg);
+ uint64_t elapsed = timer.ElapsedNanos();
+ IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer,
+ 0 /*io_op_data*/, __func__, elapsed, s.ToString(),
+ file_name_);
+ io_tracer_->WriteIOOp(io_record, dbg);
+ return s;
+}
+
+IOStatus FSRandomRWFileTracingWrapper::Close(const IOOptions& options,
+ IODebugContext* dbg) {
+ StopWatchNano timer(clock_);
+ timer.Start();
+ IOStatus s = target()->Close(options, dbg);
+ uint64_t elapsed = timer.ElapsedNanos();
+ IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer,
+ 0 /*io_op_data*/, __func__, elapsed, s.ToString(),
+ file_name_);
+ io_tracer_->WriteIOOp(io_record, dbg);
+ return s;
+}
+
+IOStatus FSRandomRWFileTracingWrapper::Sync(const IOOptions& options,
+ IODebugContext* dbg) {
+ StopWatchNano timer(clock_);
+ timer.Start();
+ IOStatus s = target()->Sync(options, dbg);
+ uint64_t elapsed = timer.ElapsedNanos();
+ IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer,
+ 0 /*io_op_data*/, __func__, elapsed, s.ToString(),
+ file_name_);
+ io_tracer_->WriteIOOp(io_record, dbg);
+ return s;
+}
+
+IOStatus FSRandomRWFileTracingWrapper::Fsync(const IOOptions& options,
+ IODebugContext* dbg) {
+ StopWatchNano timer(clock_);
+ timer.Start();
+ IOStatus s = target()->Fsync(options, dbg);
+ uint64_t elapsed = timer.ElapsedNanos();
+ IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer,
+ 0 /*io_op_data*/, __func__, elapsed, s.ToString(),
+ file_name_);
+ io_tracer_->WriteIOOp(io_record, dbg);
+ return s;
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/env/file_system_tracer.h b/src/rocksdb/env/file_system_tracer.h
new file mode 100644
index 000000000..979a0bf12
--- /dev/null
+++ b/src/rocksdb/env/file_system_tracer.h
@@ -0,0 +1,461 @@
+// Copyright (c) 2019-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/file_system.h"
+#include "rocksdb/system_clock.h"
+#include "trace_replay/io_tracer.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// FileSystemTracingWrapper is a wrapper class above FileSystem that forwards
+// the call to the underlying storage system. It then invokes IOTracer to record
+// file operations and other contextual information in a binary format for
+// tracing. It overrides methods we are interested in tracing and extends
+// FileSystemWrapper, which forwards all methods that are not explicitly
+// overridden.
class FileSystemTracingWrapper : public FileSystemWrapper {
 public:
  // `t` is the FileSystem being traced; `io_tracer` receives one record per
  // overridden operation. Timing uses the default SystemClock.
  FileSystemTracingWrapper(const std::shared_ptr<FileSystem>& t,
                           const std::shared_ptr<IOTracer>& io_tracer)
      : FileSystemWrapper(t),
        io_tracer_(io_tracer),
        clock_(SystemClock::Default().get()) {}

  ~FileSystemTracingWrapper() override {}

  static const char* kClassName() { return "FileSystemTracing"; }
  const char* Name() const override { return kClassName(); }

  // Each override below forwards to the target, measures the elapsed time,
  // and writes an IOTraceRecord keyed by the file/directory basename.
  IOStatus NewSequentialFile(const std::string& fname,
                             const FileOptions& file_opts,
                             std::unique_ptr<FSSequentialFile>* result,
                             IODebugContext* dbg) override;

  IOStatus NewRandomAccessFile(const std::string& fname,
                               const FileOptions& file_opts,
                               std::unique_ptr<FSRandomAccessFile>* result,
                               IODebugContext* dbg) override;

  IOStatus NewWritableFile(const std::string& fname,
                           const FileOptions& file_opts,
                           std::unique_ptr<FSWritableFile>* result,
                           IODebugContext* dbg) override;

  IOStatus ReopenWritableFile(const std::string& fname,
                              const FileOptions& file_opts,
                              std::unique_ptr<FSWritableFile>* result,
                              IODebugContext* dbg) override;

  IOStatus ReuseWritableFile(const std::string& fname,
                             const std::string& old_fname,
                             const FileOptions& file_opts,
                             std::unique_ptr<FSWritableFile>* result,
                             IODebugContext* dbg) override;

  IOStatus NewRandomRWFile(const std::string& fname, const FileOptions& options,
                           std::unique_ptr<FSRandomRWFile>* result,
                           IODebugContext* dbg) override;

  IOStatus NewDirectory(const std::string& name, const IOOptions& io_opts,
                        std::unique_ptr<FSDirectory>* result,
                        IODebugContext* dbg) override;

  IOStatus GetChildren(const std::string& dir, const IOOptions& io_opts,
                       std::vector<std::string>* r,
                       IODebugContext* dbg) override;

  IOStatus DeleteFile(const std::string& fname, const IOOptions& options,
                      IODebugContext* dbg) override;

  IOStatus CreateDir(const std::string& dirname, const IOOptions& options,
                     IODebugContext* dbg) override;

  IOStatus CreateDirIfMissing(const std::string& dirname,
                              const IOOptions& options,
                              IODebugContext* dbg) override;

  IOStatus DeleteDir(const std::string& dirname, const IOOptions& options,
                     IODebugContext* dbg) override;

  IOStatus GetFileSize(const std::string& fname, const IOOptions& options,
                       uint64_t* file_size, IODebugContext* dbg) override;

  IOStatus Truncate(const std::string& fname, size_t size,
                    const IOOptions& options, IODebugContext* dbg) override;

 private:
  // Sink for trace records; shared with the owning tracer infrastructure.
  std::shared_ptr<IOTracer> io_tracer_;
  // Raw pointer to the process-wide default clock; not owned.
  SystemClock* clock_;
};
+
+// The FileSystemPtr is a wrapper class that takes pointer to storage systems
+// (such as posix filesystems). It overloads operator -> and returns a pointer
+// of either FileSystem or FileSystemTracingWrapper based on whether tracing is
+// enabled or not. It is added to bypass FileSystemTracingWrapper when tracing
+// is disabled.
+class FileSystemPtr {
+ public:
+ FileSystemPtr(std::shared_ptr<FileSystem> fs,
+ const std::shared_ptr<IOTracer>& io_tracer)
+ : fs_(fs), io_tracer_(io_tracer) {
+ fs_tracer_ = std::make_shared<FileSystemTracingWrapper>(fs_, io_tracer_);
+ }
+
+ std::shared_ptr<FileSystem> operator->() const {
+ if (io_tracer_ && io_tracer_->is_tracing_enabled()) {
+ return fs_tracer_;
+ } else {
+ return fs_;
+ }
+ }
+
+ /* Returns the underlying File System pointer */
+ FileSystem* get() const {
+ if (io_tracer_ && io_tracer_->is_tracing_enabled()) {
+ return fs_tracer_.get();
+ } else {
+ return fs_.get();
+ }
+ }
+
+ private:
+ std::shared_ptr<FileSystem> fs_;
+ std::shared_ptr<IOTracer> io_tracer_;
+ std::shared_ptr<FileSystemTracingWrapper> fs_tracer_;
+};
+
+// FSSequentialFileTracingWrapper is a wrapper class above FSSequentialFile that
+// forwards the call to the underlying storage system. It then invokes IOTracer
+// to record file operations and other contextual information in a binary format
+// for tracing. It overrides methods we are interested in tracing and extends
+// FSSequentialFileWrapper, which forwards all methods that are not explicitly
+// overridden.
+class FSSequentialFileTracingWrapper : public FSSequentialFileOwnerWrapper {
+ public:
+ FSSequentialFileTracingWrapper(std::unique_ptr<FSSequentialFile>&& t,
+ std::shared_ptr<IOTracer> io_tracer,
+ const std::string& file_name)
+ : FSSequentialFileOwnerWrapper(std::move(t)),
+ io_tracer_(io_tracer),
+ clock_(SystemClock::Default().get()),
+ file_name_(file_name) {}
+
+ ~FSSequentialFileTracingWrapper() override {}
+
+ IOStatus Read(size_t n, const IOOptions& options, Slice* result,
+ char* scratch, IODebugContext* dbg) override;
+
+ IOStatus InvalidateCache(size_t offset, size_t length) override;
+
+ IOStatus PositionedRead(uint64_t offset, size_t n, const IOOptions& options,
+ Slice* result, char* scratch,
+ IODebugContext* dbg) override;
+
+ private:
+ std::shared_ptr<IOTracer> io_tracer_;
+ SystemClock* clock_;
+ std::string file_name_;
+};
+
+// The FSSequentialFilePtr is a wrapper class that takes pointer to storage
+// systems (such as posix filesystems). It overloads operator -> and returns a
+// pointer of either FSSequentialFile or FSSequentialFileTracingWrapper based on
+// whether tracing is enabled or not. It is added to bypass
+// FSSequentialFileTracingWrapper when tracing is disabled.
class FSSequentialFilePtr {
 public:
  FSSequentialFilePtr() = delete;
  // Wraps `fs` in a tracing wrapper unconditionally; only the basename of
  // `file_name` is kept for trace records.
  FSSequentialFilePtr(std::unique_ptr<FSSequentialFile>&& fs,
                      const std::shared_ptr<IOTracer>& io_tracer,
                      const std::string& file_name)
      : io_tracer_(io_tracer),
        fs_tracer_(std::move(fs), io_tracer_,
                   file_name.substr(file_name.find_last_of("/\\") +
                                    1) /* pass file name */) {}

  // Returns the tracing wrapper while tracing is enabled, else the wrapped
  // file. const_cast is safe: fs_tracer_ itself is a non-const member; only
  // `this` is const here.
  FSSequentialFile* operator->() const {
    if (io_tracer_ && io_tracer_->is_tracing_enabled()) {
      return const_cast<FSSequentialFileTracingWrapper*>(&fs_tracer_);
    } else {
      return fs_tracer_.target();
    }
  }

  // Same selection logic as operator->.
  FSSequentialFile* get() const {
    if (io_tracer_ && io_tracer_->is_tracing_enabled()) {
      return const_cast<FSSequentialFileTracingWrapper*>(&fs_tracer_);
    } else {
      return fs_tracer_.target();
    }
  }

 private:
  std::shared_ptr<IOTracer> io_tracer_;
  FSSequentialFileTracingWrapper fs_tracer_;
};
+
+// FSRandomAccessFileTracingWrapper is a wrapper class above FSRandomAccessFile
+// that forwards the call to the underlying storage system. It then invokes
+// IOTracer to record file operations and other contextual information in a
+// binary format for tracing. It overrides methods we are interested in tracing
+// and extends FSRandomAccessFileWrapper, which forwards all methods that are
+// not explicitly overridden.
+class FSRandomAccessFileTracingWrapper : public FSRandomAccessFileOwnerWrapper {
+ public:
+ FSRandomAccessFileTracingWrapper(std::unique_ptr<FSRandomAccessFile>&& t,
+ std::shared_ptr<IOTracer> io_tracer,
+ const std::string& file_name)
+ : FSRandomAccessFileOwnerWrapper(std::move(t)),
+ io_tracer_(io_tracer),
+ clock_(SystemClock::Default().get()),
+ file_name_(file_name) {}
+
+ ~FSRandomAccessFileTracingWrapper() override {}
+
+ IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+ Slice* result, char* scratch,
+ IODebugContext* dbg) const override;
+
+ IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs,
+ const IOOptions& options, IODebugContext* dbg) override;
+
+ IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& options,
+ IODebugContext* dbg) override;
+
+ IOStatus InvalidateCache(size_t offset, size_t length) override;
+
+ IOStatus ReadAsync(FSReadRequest& req, const IOOptions& opts,
+ std::function<void(const FSReadRequest&, void*)> cb,
+ void* cb_arg, void** io_handle, IOHandleDeleter* del_fn,
+ IODebugContext* dbg) override;
+
+ void ReadAsyncCallback(const FSReadRequest& req, void* cb_arg);
+
+ private:
+ std::shared_ptr<IOTracer> io_tracer_;
+ SystemClock* clock_;
+ // Stores file name instead of full path.
+ std::string file_name_;
+
+ struct ReadAsyncCallbackInfo {
+ uint64_t start_time_;
+ std::function<void(const FSReadRequest&, void*)> cb_;
+ void* cb_arg_;
+ std::string file_op_;
+ };
+};
+
+// The FSRandomAccessFilePtr is a wrapper class that takes pointer to storage
+// systems (such as posix filesystems). It overloads operator -> and returns a
+// pointer of either FSRandomAccessFile or FSRandomAccessFileTracingWrapper
+// based on whether tracing is enabled or not. It is added to bypass
+// FSRandomAccessFileTracingWrapper when tracing is disabled.
class FSRandomAccessFilePtr {
 public:
  // Wraps `fs` in a tracing wrapper unconditionally; only the basename of
  // `file_name` is kept for trace records.
  FSRandomAccessFilePtr(std::unique_ptr<FSRandomAccessFile>&& fs,
                        const std::shared_ptr<IOTracer>& io_tracer,
                        const std::string& file_name)
      : io_tracer_(io_tracer),
        fs_tracer_(std::move(fs), io_tracer_,
                   file_name.substr(file_name.find_last_of("/\\") +
                                    1) /* pass file name */) {}

  // Returns the tracing wrapper while tracing is enabled, else the wrapped
  // file. const_cast is safe: fs_tracer_ itself is a non-const member; only
  // `this` is const here.
  FSRandomAccessFile* operator->() const {
    if (io_tracer_ && io_tracer_->is_tracing_enabled()) {
      return const_cast<FSRandomAccessFileTracingWrapper*>(&fs_tracer_);
    } else {
      return fs_tracer_.target();
    }
  }

  // Same selection logic as operator->.
  FSRandomAccessFile* get() const {
    if (io_tracer_ && io_tracer_->is_tracing_enabled()) {
      return const_cast<FSRandomAccessFileTracingWrapper*>(&fs_tracer_);
    } else {
      return fs_tracer_.target();
    }
  }

 private:
  std::shared_ptr<IOTracer> io_tracer_;
  FSRandomAccessFileTracingWrapper fs_tracer_;
};
+
// FSWritableFileTracingWrapper is a wrapper class above FSWritableFile that
// forwards the call to the underlying storage system. It then invokes IOTracer
// to record file operations and other contextual information in a binary format
// for tracing. It overrides methods we are interested in tracing and extends
// FSWritableFileWrapper, which forwards all methods that are not explicitly
// overridden.
class FSWritableFileTracingWrapper : public FSWritableFileOwnerWrapper {
 public:
  FSWritableFileTracingWrapper(std::unique_ptr<FSWritableFile>&& t,
                               std::shared_ptr<IOTracer> io_tracer,
                               const std::string& file_name)
      : FSWritableFileOwnerWrapper(std::move(t)),
        io_tracer_(io_tracer),
        // Raw pointer only; SystemClock::Default() outlives this wrapper, so
        // no ownership is taken.
        clock_(SystemClock::Default().get()),
        file_name_(file_name) {}

  ~FSWritableFileTracingWrapper() override {}

  IOStatus Append(const Slice& data, const IOOptions& options,
                  IODebugContext* dbg) override;
  // The checksum-carrying overload is traced exactly like a plain Append;
  // the verification info itself is not recorded in the trace.
  IOStatus Append(const Slice& data, const IOOptions& options,
                  const DataVerificationInfo& /*verification_info*/,
                  IODebugContext* dbg) override {
    return Append(data, options, dbg);
  }

  IOStatus PositionedAppend(const Slice& data, uint64_t offset,
                            const IOOptions& options,
                            IODebugContext* dbg) override;
  // As with Append, the verification info is dropped for tracing purposes.
  IOStatus PositionedAppend(const Slice& data, uint64_t offset,
                            const IOOptions& options,
                            const DataVerificationInfo& /*verification_info*/,
                            IODebugContext* dbg) override {
    return PositionedAppend(data, offset, options, dbg);
  }

  IOStatus Truncate(uint64_t size, const IOOptions& options,
                    IODebugContext* dbg) override;

  IOStatus Close(const IOOptions& options, IODebugContext* dbg) override;

  uint64_t GetFileSize(const IOOptions& options, IODebugContext* dbg) override;

  IOStatus InvalidateCache(size_t offset, size_t length) override;

 private:
  std::shared_ptr<IOTracer> io_tracer_;
  SystemClock* clock_;
  // Stores file name instead of full path.
  std::string file_name_;
};
+
+// The FSWritableFilePtr is a wrapper class that takes pointer to storage
+// systems (such as posix filesystems). It overloads operator -> and returns a
+// pointer of either FSWritableFile or FSWritableFileTracingWrapper based on
+// whether tracing is enabled or not. It is added to bypass
+// FSWritableFileTracingWrapper when tracing is disabled.
+class FSWritableFilePtr {
+ public:
+ FSWritableFilePtr(std::unique_ptr<FSWritableFile>&& fs,
+ const std::shared_ptr<IOTracer>& io_tracer,
+ const std::string& file_name)
+ : io_tracer_(io_tracer) {
+ fs_tracer_.reset(new FSWritableFileTracingWrapper(
+ std::move(fs), io_tracer_,
+ file_name.substr(file_name.find_last_of("/\\") +
+ 1) /* pass file name */));
+ }
+
+ FSWritableFile* operator->() const {
+ if (io_tracer_ && io_tracer_->is_tracing_enabled()) {
+ return fs_tracer_.get();
+ } else {
+ return fs_tracer_->target();
+ }
+ }
+
+ FSWritableFile* get() const {
+ if (io_tracer_ && io_tracer_->is_tracing_enabled()) {
+ return fs_tracer_.get();
+ } else if (fs_tracer_) {
+ return fs_tracer_->target();
+ } else {
+ return nullptr;
+ }
+ }
+
+ void reset() {
+ fs_tracer_.reset();
+ io_tracer_ = nullptr;
+ }
+
+ private:
+ std::shared_ptr<IOTracer> io_tracer_;
+ std::unique_ptr<FSWritableFileTracingWrapper> fs_tracer_;
+};
+
// FSRandomRWFileTracingWrapper is a wrapper class above FSRandomRWFile that
// forwards the call to the underlying storage system. It then invokes IOTracer
// to record file operations and other contextual information in a binary format
// for tracing. It overrides methods we are interested in tracing and extends
// FSRandomRWFileWrapper, which forwards all methods that are not explicitly
// overridden.
class FSRandomRWFileTracingWrapper : public FSRandomRWFileOwnerWrapper {
 public:
  FSRandomRWFileTracingWrapper(std::unique_ptr<FSRandomRWFile>&& t,
                               std::shared_ptr<IOTracer> io_tracer,
                               const std::string& file_name)
      : FSRandomRWFileOwnerWrapper(std::move(t)),
        io_tracer_(io_tracer),
        // Raw pointer only; SystemClock::Default() outlives this wrapper.
        clock_(SystemClock::Default().get()),
        file_name_(file_name) {}

  ~FSRandomRWFileTracingWrapper() override {}

  IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& options,
                 IODebugContext* dbg) override;

  IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
                Slice* result, char* scratch,
                IODebugContext* dbg) const override;

  IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override;

  IOStatus Close(const IOOptions& options, IODebugContext* dbg) override;

  IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override;

  IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override;

 private:
  std::shared_ptr<IOTracer> io_tracer_;
  SystemClock* clock_;
  // Stores file name instead of full path.
  std::string file_name_;
};
+
// The FSRandomRWFilePtr is a wrapper class that takes pointer to storage
// systems (such as posix filesystems). It overloads operator -> and returns a
// pointer of either FSRandomRWFile or FSRandomRWFileTracingWrapper based on
// whether tracing is enabled or not. It is added to bypass
// FSRandomRWFileTracingWrapper when tracing is disabled.
class FSRandomRWFilePtr {
 public:
  // Takes ownership of `fs` via the tracing wrapper; only the base name of
  // `file_name` is stored for tracing.
  FSRandomRWFilePtr(std::unique_ptr<FSRandomRWFile>&& fs,
                    std::shared_ptr<IOTracer> io_tracer,
                    const std::string& file_name)
      : io_tracer_(io_tracer),
        fs_tracer_(std::move(fs), io_tracer_,
                   file_name.substr(file_name.find_last_of("/\\") +
                                    1) /* pass file name */) {}

  // Dispatches through the tracing wrapper only while tracing is enabled;
  // const_cast is needed because `this` is const but the returned pointer
  // must be mutable.
  FSRandomRWFile* operator->() const {
    if (io_tracer_ && io_tracer_->is_tracing_enabled()) {
      return const_cast<FSRandomRWFileTracingWrapper*>(&fs_tracer_);
    } else {
      return fs_tracer_.target();
    }
  }

  // Same dispatch rule as operator->.
  FSRandomRWFile* get() const {
    if (io_tracer_ && io_tracer_->is_tracing_enabled()) {
      return const_cast<FSRandomRWFileTracingWrapper*>(&fs_tracer_);
    } else {
      return fs_tracer_.target();
    }
  }

 private:
  std::shared_ptr<IOTracer> io_tracer_;
  // Held by value, so it always exists for the lifetime of this object.
  FSRandomRWFileTracingWrapper fs_tracer_;
};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/env/fs_posix.cc b/src/rocksdb/env/fs_posix.cc
new file mode 100644
index 000000000..e179a421d
--- /dev/null
+++ b/src/rocksdb/env/fs_posix.cc
@@ -0,0 +1,1294 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors
+
+#if !defined(OS_WIN)
+
+#include <dirent.h>
+#ifndef ROCKSDB_NO_DYNAMIC_EXTENSION
+#include <dlfcn.h>
+#endif
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#if defined(OS_LINUX) || defined(OS_SOLARIS) || defined(OS_ANDROID)
+#include <sys/statfs.h>
+#include <sys/sysmacros.h>
+#endif
+#include <sys/statvfs.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <time.h>
+
+#include <algorithm>
+// Get nano time includes
+#if defined(OS_LINUX) || defined(OS_FREEBSD)
+#elif defined(__MACH__)
+#include <Availability.h>
+#include <mach/clock.h>
+#include <mach/mach.h>
+#else
+#include <chrono>
+#endif
+#include <deque>
+#include <set>
+#include <vector>
+
+#include "env/composite_env_wrapper.h"
+#include "env/io_posix.h"
+#include "monitoring/iostats_context_imp.h"
+#include "monitoring/thread_status_updater.h"
+#include "port/lang.h"
+#include "port/port.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+#include "util/compression_context_cache.h"
+#include "util/random.h"
+#include "util/string_util.h"
+#include "util/thread_local.h"
+#include "util/threadpool_imp.h"
+
+#if !defined(TMPFS_MAGIC)
+#define TMPFS_MAGIC 0x01021994
+#endif
+#if !defined(XFS_SUPER_MAGIC)
+#define XFS_SUPER_MAGIC 0x58465342
+#endif
+#if !defined(EXT4_SUPER_MAGIC)
+#define EXT4_SUPER_MAGIC 0xEF53
+#endif
+
+extern "C" bool RocksDbIOUringEnable() __attribute__((__weak__));
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
// Permission bits for newly created DB files: world-readable (0644) when
// non-owner access is allowed, owner-only (0600) otherwise.
inline mode_t GetDBFileMode(bool allow_non_owner_access) {
  if (allow_non_owner_access) {
    return 0644;
  }
  return 0600;
}
+
// list of pathnames that are locked
// Only used for error message.
struct LockHoldingInfo {
  // Seconds since epoch when the lock was acquired; formatted into the
  // "lock hold by current process" error message.
  int64_t acquire_time;
  // ID of the thread that acquired the lock.
  uint64_t acquiring_thread;
};
// Pathname -> holder info for every file this process currently has locked.
// Guarded by mutex_locked_files.
static std::map<std::string, LockHoldingInfo> locked_files;
static port::Mutex mutex_locked_files;
+
// Acquires (lock == true) or releases (lock == false) a non-blocking POSIX
// record lock covering the whole file referred to by `fd`, using
// fcntl(F_SETLK). Returns fcntl's result: 0 on success, -1 with errno set on
// failure. errno is cleared up front so callers can inspect it reliably.
static int LockOrUnlock(int fd, bool lock) {
  errno = 0;
  struct flock lock_request;
  memset(&lock_request, 0, sizeof(lock_request));
  lock_request.l_type = lock ? F_WRLCK : F_UNLCK;
  lock_request.l_whence = SEEK_SET;
  // l_start == 0 together with l_len == 0 means "the entire file".
  lock_request.l_start = 0;
  lock_request.l_len = 0;
  return fcntl(fd, F_SETLK, &lock_request);
}
+
// FileLock implementation that records the fd carrying the fcntl() lock and
// the locked pathname (the key into the process-wide locked_files map).
class PosixFileLock : public FileLock {
 public:
  int fd_ = /*invalid*/ -1;
  std::string filename;

  // Marks the lock as released; must be called (by UnlockFile) before
  // destruction, otherwise the destructor assertion fires.
  void Clear() {
    fd_ = -1;
    filename.clear();
  }

  virtual ~PosixFileLock() override {
    // Check for destruction without UnlockFile
    assert(fd_ == -1);
  }
};
+
// Returns `flags` with O_CLOEXEC added when the platform supports it and the
// caller either passed no options or asked for set_fd_cloexec.
int cloexec_flags(int flags, const EnvOptions* options) {
  // If the system supports opening the file with cloexec enabled,
  // do so, as this avoids a race condition if a db is opened around
  // the same time that a child process is forked
#ifdef O_CLOEXEC
  if (options == nullptr || options->set_fd_cloexec) {
    flags |= O_CLOEXEC;
  }
#else
  (void)options;
#endif
  return flags;
}
+
+class PosixFileSystem : public FileSystem {
+ public:
+ PosixFileSystem();
+
  static const char* kClassName() { return "PosixFileSystem"; }
  const char* Name() const override { return kClassName(); }
  // This is the default FileSystem, so it also answers to the default name.
  const char* NickName() const override { return kDefaultName(); }

  ~PosixFileSystem() override {}
  // Matches the historical "posix" alias in addition to whatever names the
  // base-class implementation recognizes.
  bool IsInstanceOf(const std::string& name) const override {
    if (name == "posix") {
      return true;
    } else {
      return FileSystem::IsInstanceOf(name);
    }
  }
+
  // Sets FD_CLOEXEC on `fd` when requested by `options` (or when `options`
  // is null). NOTE(review): the `fd > 0` guard also skips fd 0, which is a
  // valid descriptor — presumably intentional (stdin is never a DB file);
  // confirm before changing.
  void SetFD_CLOEXEC(int fd, const EnvOptions* options) {
    if ((options == nullptr || options->set_fd_cloexec) && fd > 0) {
      fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
    }
  }
+
  // Opens `fname` read-only for sequential access. Direct I/O uses the raw
  // fd (with O_DIRECT where available, F_NOCACHE on macOS); otherwise the fd
  // is wrapped in a buffered FILE* via fdopen.
  IOStatus NewSequentialFile(const std::string& fname,
                             const FileOptions& options,
                             std::unique_ptr<FSSequentialFile>* result,
                             IODebugContext* /*dbg*/) override {
    result->reset();
    int fd = -1;
    int flags = cloexec_flags(O_RDONLY, &options);
    FILE* file = nullptr;

    if (options.use_direct_reads && !options.use_mmap_reads) {
#ifdef ROCKSDB_LITE
      return IOStatus::IOError(fname,
                               "Direct I/O not supported in RocksDB lite");
#endif // !ROCKSDB_LITE
#if !defined(OS_MACOSX) && !defined(OS_OPENBSD) && !defined(OS_SOLARIS)
      flags |= O_DIRECT;
      TEST_SYNC_POINT_CALLBACK("NewSequentialFile:O_DIRECT", &flags);
#endif
    }

    // Retry open() on EINTR (signal interruption).
    do {
      IOSTATS_TIMER_GUARD(open_nanos);
      fd = open(fname.c_str(), flags, GetDBFileMode(allow_non_owner_access_));
    } while (fd < 0 && errno == EINTR);
    if (fd < 0) {
      return IOError("While opening a file for sequentially reading", fname,
                     errno);
    }

    SetFD_CLOEXEC(fd, &options);

    if (options.use_direct_reads && !options.use_mmap_reads) {
#ifdef OS_MACOSX
      // macOS has no O_DIRECT; disable the page cache per-fd instead.
      if (fcntl(fd, F_NOCACHE, 1) == -1) {
        close(fd);
        return IOError("While fcntl NoCache", fname, errno);
      }
#endif
    } else {
      // Buffered path: wrap the fd in a FILE*, retrying fdopen on EINTR.
      do {
        IOSTATS_TIMER_GUARD(open_nanos);
        file = fdopen(fd, "r");
      } while (file == nullptr && errno == EINTR);
      if (file == nullptr) {
        close(fd);
        return IOError("While opening file for sequentially read", fname,
                       errno);
      }
    }
    // In the direct-I/O case `file` stays nullptr and the fd is used directly.
    result->reset(new PosixSequentialFile(
        fname, file, fd, GetLogicalBlockSizeForReadIfNeeded(options, fname, fd),
        options));
    return IOStatus::OK();
  }
+
  // Opens `fname` read-only for random access. With use_mmap_reads the whole
  // file is mapped; otherwise a pread-based file is returned, optionally with
  // O_DIRECT and (when available and enabled) a thread-local io_uring.
  IOStatus NewRandomAccessFile(const std::string& fname,
                               const FileOptions& options,
                               std::unique_ptr<FSRandomAccessFile>* result,
                               IODebugContext* /*dbg*/) override {
    result->reset();
    IOStatus s = IOStatus::OK();
    int fd;
    int flags = cloexec_flags(O_RDONLY, &options);

    if (options.use_direct_reads && !options.use_mmap_reads) {
#ifdef ROCKSDB_LITE
      return IOStatus::IOError(fname,
                               "Direct I/O not supported in RocksDB lite");
#endif // !ROCKSDB_LITE
#if !defined(OS_MACOSX) && !defined(OS_OPENBSD) && !defined(OS_SOLARIS)
      flags |= O_DIRECT;
      TEST_SYNC_POINT_CALLBACK("NewRandomAccessFile:O_DIRECT", &flags);
#endif
    }

    // Retry open() on EINTR (signal interruption).
    do {
      IOSTATS_TIMER_GUARD(open_nanos);
      fd = open(fname.c_str(), flags, GetDBFileMode(allow_non_owner_access_));
    } while (fd < 0 && errno == EINTR);
    if (fd < 0) {
      s = IOError("While open a file for random read", fname, errno);
      return s;
    }
    SetFD_CLOEXEC(fd, &options);

    if (options.use_mmap_reads) {
      // Use of mmap for random reads has been removed because it
      // kills performance when storage is fast.
      // Use mmap when virtual address-space is plentiful.
      uint64_t size;
      IOOptions opts;
      s = GetFileSize(fname, opts, &size, nullptr);
      if (s.ok()) {
        void* base = mmap(nullptr, size, PROT_READ, MAP_SHARED, fd, 0);
        if (base != MAP_FAILED) {
          // PosixMmapReadableFile takes over the fd and the mapping.
          result->reset(
              new PosixMmapReadableFile(fd, fname, base, size, options));
        } else {
          s = IOError("while mmap file for read", fname, errno);
          close(fd);
        }
      } else {
        close(fd);
      }
    } else {
      if (options.use_direct_reads && !options.use_mmap_reads) {
#ifdef OS_MACOSX
        // macOS has no O_DIRECT; disable the page cache per-fd instead.
        if (fcntl(fd, F_NOCACHE, 1) == -1) {
          close(fd);
          return IOError("while fcntl NoCache", fname, errno);
        }
#endif
      }
      result->reset(new PosixRandomAccessFile(
          fname, fd, GetLogicalBlockSizeForReadIfNeeded(options, fname, fd),
          options
#if defined(ROCKSDB_IOURING_PRESENT)
          ,
          !IsIOUringEnabled() ? nullptr : thread_local_io_urings_.get()
#endif
          ));
    }
    return s;
  }
+
  // Shared implementation behind NewWritableFile (reopen == false: truncate)
  // and ReopenWritableFile (reopen == true: append). Picks between mmap,
  // direct, and plain buffered writes based on `options`.
  virtual IOStatus OpenWritableFile(const std::string& fname,
                                    const FileOptions& options, bool reopen,
                                    std::unique_ptr<FSWritableFile>* result,
                                    IODebugContext* /*dbg*/) {
    result->reset();
    IOStatus s;
    int fd = -1;
    int flags = (reopen) ? (O_CREAT | O_APPEND) : (O_CREAT | O_TRUNC);
    // Direct IO mode with O_DIRECT flag or F_NOCAHCE (MAC OSX)
    if (options.use_direct_writes && !options.use_mmap_writes) {
      // Note: we should avoid O_APPEND here due to ta the following bug:
      // POSIX requires that opening a file with the O_APPEND flag should
      // have no affect on the location at which pwrite() writes data.
      // However, on Linux, if a file is opened with O_APPEND, pwrite()
      // appends data to the end of the file, regardless of the value of
      // offset.
      // More info here: https://linux.die.net/man/2/pwrite
#ifdef ROCKSDB_LITE
      return IOStatus::IOError(fname,
                               "Direct I/O not supported in RocksDB lite");
#endif // ROCKSDB_LITE
      flags |= O_WRONLY;
#if !defined(OS_MACOSX) && !defined(OS_OPENBSD) && !defined(OS_SOLARIS)
      flags |= O_DIRECT;
#endif
      TEST_SYNC_POINT_CALLBACK("NewWritableFile:O_DIRECT", &flags);
    } else if (options.use_mmap_writes) {
      // non-direct I/O
      // mmap-based writes need the file readable as well (PROT_WRITE map of
      // a write-only fd would fail).
      flags |= O_RDWR;
    } else {
      flags |= O_WRONLY;
    }

    flags = cloexec_flags(flags, &options);

    // Retry open() on EINTR (signal interruption).
    do {
      IOSTATS_TIMER_GUARD(open_nanos);
      fd = open(fname.c_str(), flags, GetDBFileMode(allow_non_owner_access_));
    } while (fd < 0 && errno == EINTR);

    if (fd < 0) {
      s = IOError("While open a file for appending", fname, errno);
      return s;
    }
    SetFD_CLOEXEC(fd, &options);

    if (options.use_mmap_writes) {
      // One-time filesystem probe; may set forceMmapOff_ (see below).
      MaybeForceDisableMmap(fd);
    }
    if (options.use_mmap_writes && !forceMmapOff_) {
      result->reset(new PosixMmapFile(fname, fd, page_size_, options));
    } else if (options.use_direct_writes && !options.use_mmap_writes) {
#ifdef OS_MACOSX
      // macOS has no O_DIRECT; disable the page cache per-fd instead.
      if (fcntl(fd, F_NOCACHE, 1) == -1) {
        close(fd);
        s = IOError("While fcntl NoCache an opened file for appending", fname,
                    errno);
        return s;
      }
#elif defined(OS_SOLARIS)
      if (directio(fd, DIRECTIO_ON) == -1) {
        if (errno != ENOTTY) { // ZFS filesystems don't support DIRECTIO_ON
          close(fd);
          s = IOError("While calling directio()", fname, errno);
          return s;
        }
      }
#endif
      result->reset(new PosixWritableFile(
          fname, fd, GetLogicalBlockSizeForWriteIfNeeded(options, fname, fd),
          options));
    } else {
      // disable mmap writes
      // (either they were not requested, or the filesystem probe above ruled
      // them out)
      EnvOptions no_mmap_writes_options = options;
      no_mmap_writes_options.use_mmap_writes = false;
      result->reset(
          new PosixWritableFile(fname, fd,
                                GetLogicalBlockSizeForWriteIfNeeded(
                                    no_mmap_writes_options, fname, fd),
                                no_mmap_writes_options));
    }
    return s;
  }
+
  // Creates (or truncates) `fname` for writing.
  IOStatus NewWritableFile(const std::string& fname, const FileOptions& options,
                           std::unique_ptr<FSWritableFile>* result,
                           IODebugContext* dbg) override {
    return OpenWritableFile(fname, options, false, result, dbg);
  }

  // Opens an existing `fname` for appending (creates it if absent).
  IOStatus ReopenWritableFile(const std::string& fname,
                              const FileOptions& options,
                              std::unique_ptr<FSWritableFile>* result,
                              IODebugContext* dbg) override {
    return OpenWritableFile(fname, options, true, result, dbg);
  }
+
  // Reuses the storage of `old_fname` for a new writable file `fname`:
  // opens the old file, renames it into place, and wraps the fd as a
  // writable file (mmap/direct/buffered, as with OpenWritableFile).
  IOStatus ReuseWritableFile(const std::string& fname,
                             const std::string& old_fname,
                             const FileOptions& options,
                             std::unique_ptr<FSWritableFile>* result,
                             IODebugContext* /*dbg*/) override {
    result->reset();
    IOStatus s;
    int fd = -1;

    int flags = 0;
    // Direct IO mode with O_DIRECT flag or F_NOCAHCE (MAC OSX)
    if (options.use_direct_writes && !options.use_mmap_writes) {
#ifdef ROCKSDB_LITE
      return IOStatus::IOError(fname,
                               "Direct I/O not supported in RocksDB lite");
#endif // !ROCKSDB_LITE
      flags |= O_WRONLY;
#if !defined(OS_MACOSX) && !defined(OS_OPENBSD) && !defined(OS_SOLARIS)
      flags |= O_DIRECT;
#endif
      TEST_SYNC_POINT_CALLBACK("NewWritableFile:O_DIRECT", &flags);
    } else if (options.use_mmap_writes) {
      // mmap needs O_RDWR mode
      flags |= O_RDWR;
    } else {
      flags |= O_WRONLY;
    }

    flags = cloexec_flags(flags, &options);

    // Open the *old* file (no O_CREAT/O_TRUNC — its contents are reused),
    // retrying on EINTR.
    do {
      IOSTATS_TIMER_GUARD(open_nanos);
      fd = open(old_fname.c_str(), flags,
                GetDBFileMode(allow_non_owner_access_));
    } while (fd < 0 && errno == EINTR);
    if (fd < 0) {
      s = IOError("while reopen file for write", fname, errno);
      return s;
    }

    SetFD_CLOEXEC(fd, &options);
    // rename into place
    if (rename(old_fname.c_str(), fname.c_str()) != 0) {
      s = IOError("while rename file to " + fname, old_fname, errno);
      close(fd);
      return s;
    }

    if (options.use_mmap_writes) {
      // One-time filesystem probe; may set forceMmapOff_.
      MaybeForceDisableMmap(fd);
    }
    if (options.use_mmap_writes && !forceMmapOff_) {
      result->reset(new PosixMmapFile(fname, fd, page_size_, options));
    } else if (options.use_direct_writes && !options.use_mmap_writes) {
#ifdef OS_MACOSX
      // macOS has no O_DIRECT; disable the page cache per-fd instead.
      if (fcntl(fd, F_NOCACHE, 1) == -1) {
        close(fd);
        s = IOError("while fcntl NoCache for reopened file for append", fname,
                    errno);
        return s;
      }
#elif defined(OS_SOLARIS)
      if (directio(fd, DIRECTIO_ON) == -1) {
        if (errno != ENOTTY) { // ZFS filesystems don't support DIRECTIO_ON
          close(fd);
          s = IOError("while calling directio()", fname, errno);
          return s;
        }
      }
#endif
      result->reset(new PosixWritableFile(
          fname, fd, GetLogicalBlockSizeForWriteIfNeeded(options, fname, fd),
          options));
    } else {
      // disable mmap writes
      FileOptions no_mmap_writes_options = options;
      no_mmap_writes_options.use_mmap_writes = false;
      result->reset(
          new PosixWritableFile(fname, fd,
                                GetLogicalBlockSizeForWriteIfNeeded(
                                    no_mmap_writes_options, fname, fd),
                                no_mmap_writes_options));
    }
    return s;
  }
+
  // Opens an *existing* file for combined random reads and writes (no
  // O_CREAT: the file must already exist).
  IOStatus NewRandomRWFile(const std::string& fname, const FileOptions& options,
                           std::unique_ptr<FSRandomRWFile>* result,
                           IODebugContext* /*dbg*/) override {
    int fd = -1;
    int flags = cloexec_flags(O_RDWR, &options);

    // Loop only to retry open() on EINTR; any other failure returns.
    while (fd < 0) {
      IOSTATS_TIMER_GUARD(open_nanos);

      fd = open(fname.c_str(), flags, GetDBFileMode(allow_non_owner_access_));
      if (fd < 0) {
        // Error while opening the file
        if (errno == EINTR) {
          continue;
        }
        return IOError("While open file for random read/write", fname, errno);
      }
    }

    SetFD_CLOEXEC(fd, &options);
    result->reset(new PosixRandomRWFile(fname, fd, options));
    return IOStatus::OK();
  }
+
  // Maps an existing file read-write into memory and returns the raw
  // mapping. The fd is closed before returning; the mapping itself keeps
  // the file accessible.
  IOStatus NewMemoryMappedFileBuffer(
      const std::string& fname,
      std::unique_ptr<MemoryMappedFileBuffer>* result) override {
    int fd = -1;
    IOStatus status;
    int flags = cloexec_flags(O_RDWR, nullptr);

    // Loop only to retry open() on EINTR; other failures record `status`
    // and fall through so the fd cleanup below still runs.
    while (fd < 0) {
      IOSTATS_TIMER_GUARD(open_nanos);
      fd = open(fname.c_str(), flags, 0644);
      if (fd < 0) {
        // Error while opening the file
        if (errno == EINTR) {
          continue;
        }
        status =
            IOError("While open file for raw mmap buffer access", fname, errno);
        break;
      }
    }
    uint64_t size;
    if (status.ok()) {
      IOOptions opts;
      status = GetFileSize(fname, opts, &size, nullptr);
    }
    void* base = nullptr;
    if (status.ok()) {
      base = mmap(nullptr, static_cast<size_t>(size), PROT_READ | PROT_WRITE,
                  MAP_SHARED, fd, 0);
      if (base == MAP_FAILED) {
        status = IOError("while mmap file for read", fname, errno);
      }
    }
    if (status.ok()) {
      result->reset(
          new PosixMemoryMappedFileBuffer(base, static_cast<size_t>(size)));
    }
    if (fd >= 0) {
      // don't need to keep it open after mmap has been called
      close(fd);
    }
    return status;
  }
+
  // Opens directory `name` and returns a handle usable for fsync-ing the
  // directory (durability of file creation/rename).
  IOStatus NewDirectory(const std::string& name, const IOOptions& /*opts*/,
                        std::unique_ptr<FSDirectory>* result,
                        IODebugContext* /*dbg*/) override {
    result->reset();
    int fd;
    int flags = cloexec_flags(0, nullptr);
    {
      IOSTATS_TIMER_GUARD(open_nanos);
      fd = open(name.c_str(), flags);
    }
    if (fd < 0) {
      return IOError("While open directory", name, errno);
    } else {
      // PosixDirectory takes ownership of the fd.
      result->reset(new PosixDirectory(fd, name));
    }
    return IOStatus::OK();
  }
+
  // Checks existence via access(F_OK). Path-resolution failures map to
  // NotFound; anything else is reported as an IOError.
  IOStatus FileExists(const std::string& fname, const IOOptions& /*opts*/,
                      IODebugContext* /*dbg*/) override {
    int result = access(fname.c_str(), F_OK);

    if (result == 0) {
      return IOStatus::OK();
    }

    int err = errno;
    switch (err) {
      case EACCES:
      case ELOOP:
      case ENAMETOOLONG:
      case ENOENT:
      case ENOTDIR:
        return IOStatus::NotFound();
      default:
        // POSIX documents only EIO/ENOMEM beyond the cases above; assert in
        // debug builds, report as IOError in release builds.
        assert(err == EIO || err == ENOMEM);
        return IOStatus::IOError("Unexpected error(" + std::to_string(err) +
                                 ") accessing file `" + fname + "' ");
    }
  }
+
  // Lists the entries of `dir` into `result`. "." and ".." are filtered out;
  // a missing/inaccessible directory maps to NotFound.
  IOStatus GetChildren(const std::string& dir, const IOOptions& opts,
                       std::vector<std::string>* result,
                       IODebugContext* /*dbg*/) override {
    result->clear();

    DIR* d = opendir(dir.c_str());
    if (d == nullptr) {
      switch (errno) {
        case EACCES:
        case ENOENT:
        case ENOTDIR:
          return IOStatus::NotFound();
        default:
          return IOError("While opendir", dir, errno);
      }
    }

    // reset errno before calling readdir()
    errno = 0;
    struct dirent* entry;

    while ((entry = readdir(d)) != nullptr) {
      // filter out '.' and '..' directory entries
      // which appear only on some platforms
      // NOTE(review): with do_not_recurse set, the condition below skips
      // *every* DT_DIR entry, not just '.'/'..'. Presumably intentional
      // (callers then see files only) — confirm against upstream before
      // changing.
      const bool ignore =
          entry->d_type == DT_DIR &&
          (strcmp(entry->d_name, ".") == 0 ||
           strcmp(entry->d_name, "..") == 0
#ifndef ASSERT_STATUS_CHECKED
           // In case of ASSERT_STATUS_CHECKED, GetChildren support older
           // version of API for debugging purpose.
           || opts.do_not_recurse
#endif
          );
      if (!ignore) {
        result->push_back(entry->d_name);
      }
      errno = 0; // reset errno if readdir() success
    }

    // always attempt to close the dir
    const auto pre_close_errno = errno; // errno may be modified by closedir
    const int close_result = closedir(d);

    if (pre_close_errno != 0) {
      // error occurred during readdir
      return IOError("While readdir", dir, pre_close_errno);
    }

    if (close_result != 0) {
      // error occurred during closedir
      return IOError("While closedir", dir, errno);
    }

    return IOStatus::OK();
  }
+
  // Removes `fname` via unlink().
  IOStatus DeleteFile(const std::string& fname, const IOOptions& /*opts*/,
                      IODebugContext* /*dbg*/) override {
    IOStatus result;
    if (unlink(fname.c_str()) != 0) {
      result = IOError("while unlink() file", fname, errno);
    }
    return result;
  }

  // Creates directory `name` (mode 0755); fails if it already exists.
  IOStatus CreateDir(const std::string& name, const IOOptions& /*opts*/,
                     IODebugContext* /*dbg*/) override {
    if (mkdir(name.c_str(), 0755) != 0) {
      return IOError("While mkdir", name, errno);
    }
    return IOStatus::OK();
  }
+
  // Creates directory `name` if absent. EEXIST is OK only if the existing
  // path is actually a directory.
  IOStatus CreateDirIfMissing(const std::string& name,
                              const IOOptions& /*opts*/,
                              IODebugContext* /*dbg*/) override {
    if (mkdir(name.c_str(), 0755) != 0) {
      if (errno != EEXIST) {
        return IOError("While mkdir if missing", name, errno);
      } else if (!DirExists(name)) { // Check that name is actually a
                                     // directory.
        // Message is taken from mkdir
        return IOStatus::IOError("`" + name +
                                 "' exists but is not a directory");
      }
    }
    return IOStatus::OK();
  }

  // Removes (empty) directory `name` via rmdir().
  IOStatus DeleteDir(const std::string& name, const IOOptions& /*opts*/,
                     IODebugContext* /*dbg*/) override {
    if (rmdir(name.c_str()) != 0) {
      return IOError("file rmdir", name, errno);
    }
    return IOStatus::OK();
  }
+
  // Returns the size of `fname` in bytes via stat(). On failure *size is
  // set to 0 before the error is returned.
  IOStatus GetFileSize(const std::string& fname, const IOOptions& /*opts*/,
                       uint64_t* size, IODebugContext* /*dbg*/) override {
    struct stat sbuf;
    if (stat(fname.c_str(), &sbuf) != 0) {
      *size = 0;
      return IOError("while stat a file for size", fname, errno);
    } else {
      *size = sbuf.st_size;
    }
    return IOStatus::OK();
  }

  // Returns the last-modification time (st_mtime, seconds) of `fname`.
  IOStatus GetFileModificationTime(const std::string& fname,
                                   const IOOptions& /*opts*/,
                                   uint64_t* file_mtime,
                                   IODebugContext* /*dbg*/) override {
    struct stat s;
    if (stat(fname.c_str(), &s) != 0) {
      return IOError("while stat a file for modification time", fname, errno);
    }
    *file_mtime = static_cast<uint64_t>(s.st_mtime);
    return IOStatus::OK();
  }
+
  // Atomically renames `src` to `target` (rename(2) semantics: replaces an
  // existing target).
  IOStatus RenameFile(const std::string& src, const std::string& target,
                      const IOOptions& /*opts*/,
                      IODebugContext* /*dbg*/) override {
    if (rename(src.c_str(), target.c_str()) != 0) {
      return IOError("While renaming a file to " + target, src, errno);
    }
    return IOStatus::OK();
  }

  // Creates a hard link `target` to `src`. Cross-filesystem links and
  // filesystems without hard-link support map to NotSupported so callers
  // can fall back to copying.
  IOStatus LinkFile(const std::string& src, const std::string& target,
                    const IOOptions& /*opts*/,
                    IODebugContext* /*dbg*/) override {
    if (link(src.c_str(), target.c_str()) != 0) {
      if (errno == EXDEV || errno == ENOTSUP) {
        return IOStatus::NotSupported(errno == EXDEV
                                          ? "No cross FS links allowed"
                                          : "Links not supported by FS");
      }
      return IOError("while link file to " + target, src, errno);
    }
    return IOStatus::OK();
  }

  // Returns the hard-link count (st_nlink) of `fname`.
  IOStatus NumFileLinks(const std::string& fname, const IOOptions& /*opts*/,
                        uint64_t* count, IODebugContext* /*dbg*/) override {
    struct stat s;
    if (stat(fname.c_str(), &s) != 0) {
      return IOError("while stat a file for num file links", fname, errno);
    }
    *count = static_cast<uint64_t>(s.st_nlink);
    return IOStatus::OK();
  }
+
  // Sets *res to true iff `first` and `second` refer to the same inode on
  // the same device (i.e. hard links to one file).
  IOStatus AreFilesSame(const std::string& first, const std::string& second,
                        const IOOptions& /*opts*/, bool* res,
                        IODebugContext* /*dbg*/) override {
    struct stat statbuf[2];
    if (stat(first.c_str(), &statbuf[0]) != 0) {
      return IOError("stat file", first, errno);
    }
    if (stat(second.c_str(), &statbuf[1]) != 0) {
      return IOError("stat file", second, errno);
    }

    // Compare device (major+minor) and inode number.
    if (major(statbuf[0].st_dev) != major(statbuf[1].st_dev) ||
        minor(statbuf[0].st_dev) != minor(statbuf[1].st_dev) ||
        statbuf[0].st_ino != statbuf[1].st_ino) {
      *res = false;
    } else {
      *res = true;
    }
    return IOStatus::OK();
  }
+
  // Acquires an exclusive advisory lock on `fname` (creating it if needed)
  // and returns a FileLock in *lock. A process-wide locked_files map guards
  // against re-locking from the same process, which fcntl() would silently
  // allow; mutex_locked_files is held across the whole open+lock sequence.
  IOStatus LockFile(const std::string& fname, const IOOptions& /*opts*/,
                    FileLock** lock, IODebugContext* /*dbg*/) override {
    *lock = nullptr;

    LockHoldingInfo lhi;
    int64_t current_time = 0;
    // Ignore status code as the time is only used for error message.
    SystemClock::Default()
        ->GetCurrentTime(&current_time)
        .PermitUncheckedError();
    lhi.acquire_time = current_time;
    lhi.acquiring_thread = Env::Default()->GetThreadID();

    mutex_locked_files.Lock();
    // If it already exists in the locked_files set, then it is already locked,
    // and fail this lock attempt. Otherwise, insert it into locked_files.
    // This check is needed because fcntl() does not detect lock conflict
    // if the fcntl is issued by the same thread that earlier acquired
    // this lock.
    // We must do this check *before* opening the file:
    // Otherwise, we will open a new file descriptor. Locks are associated with
    // a process, not a file descriptor and when *any* file descriptor is
    // closed, all locks the process holds for that *file* are released
    const auto it_success = locked_files.insert({fname, lhi});
    if (it_success.second == false) {
      // Insert failed: this process already holds the lock.
      LockHoldingInfo prev_info = it_success.first->second;
      mutex_locked_files.Unlock();
      errno = ENOLCK;
      // Note that the thread ID printed is the same one as the one in
      // posix logger, but posix logger prints it hex format.
      return IOError("lock hold by current process, acquire time " +
                         std::to_string(prev_info.acquire_time) +
                         " acquiring thread " +
                         std::to_string(prev_info.acquiring_thread),
                     fname, errno);
    }

    IOStatus result = IOStatus::OK();
    int fd;
    int flags = cloexec_flags(O_RDWR | O_CREAT, nullptr);

    {
      IOSTATS_TIMER_GUARD(open_nanos);
      fd = open(fname.c_str(), flags, 0644);
    }
    if (fd < 0) {
      result = IOError("while open a file for lock", fname, errno);
    } else if (LockOrUnlock(fd, true) == -1) {
      result = IOError("While lock file", fname, errno);
      close(fd);
    } else {
      SetFD_CLOEXEC(fd, nullptr);
      // Success: the returned lock keeps the fd open until UnlockFile.
      PosixFileLock* my_lock = new PosixFileLock;
      my_lock->fd_ = fd;
      my_lock->filename = fname;
      *lock = my_lock;
    }
    if (!result.ok()) {
      // If there is an error in locking, then remove the pathname from
      // locked_files. (If we got this far, it did not exist in locked_files
      // before this call.)
      locked_files.erase(fname);
    }

    mutex_locked_files.Unlock();
    return result;
  }
+
  // Releases a lock obtained from LockFile: removes the bookkeeping entry,
  // drops the fcntl lock, closes the fd, and deletes the FileLock object
  // (even on error).
  IOStatus UnlockFile(FileLock* lock, const IOOptions& /*opts*/,
                      IODebugContext* /*dbg*/) override {
    PosixFileLock* my_lock = reinterpret_cast<PosixFileLock*>(lock);
    IOStatus result;
    mutex_locked_files.Lock();
    // If we are unlocking, then verify that we had locked it earlier,
    // it should already exist in locked_files. Remove it from locked_files.
    if (locked_files.erase(my_lock->filename) != 1) {
      errno = ENOLCK;
      result = IOError("unlock", my_lock->filename, errno);
    } else if (LockOrUnlock(my_lock->fd_, false) == -1) {
      result = IOError("unlock", my_lock->filename, errno);
    }
    // Closing the fd releases any remaining lock state for this file.
    close(my_lock->fd_);
    my_lock->Clear();
    delete my_lock;
    mutex_locked_files.Unlock();
    return result;
  }
+
  // Returns `db_path` unchanged when it is already absolute; otherwise
  // returns the current working directory. NOTE(review): for a relative
  // db_path the path itself is NOT appended to the cwd — presumably callers
  // only pass "" or an absolute path here; confirm before relying on this.
  IOStatus GetAbsolutePath(const std::string& db_path,
                           const IOOptions& /*opts*/, std::string* output_path,
                           IODebugContext* /*dbg*/) override {
    if (!db_path.empty() && db_path[0] == '/') {
      *output_path = db_path;
      return IOStatus::OK();
    }

    char the_path[4096];
    char* ret = getcwd(the_path, 4096);
    if (ret == nullptr) {
      return IOStatus::IOError(errnoStr(errno).c_str());
    }

    *output_path = ret;
    return IOStatus::OK();
  }
+
+ IOStatus GetTestDirectory(const IOOptions& /*opts*/, std::string* result,
+ IODebugContext* /*dbg*/) override {
+ const char* env = getenv("TEST_TMPDIR");
+ if (env && env[0] != '\0') {
+ *result = env;
+ } else {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "/tmp/rocksdbtest-%d", int(geteuid()));
+ *result = buf;
+ }
+ // Directory may already exist
+ {
+ IOOptions opts;
+ return CreateDirIfMissing(*result, opts, nullptr);
+ }
+ return IOStatus::OK();
+ }
+
  // Reports free space (bytes) on the filesystem containing `fname`,
  // choosing the figure appropriate for the calling user's privileges.
  IOStatus GetFreeSpace(const std::string& fname, const IOOptions& /*opts*/,
                        uint64_t* free_space,
                        IODebugContext* /*dbg*/) override {
    struct statvfs sbuf;

    if (statvfs(fname.c_str(), &sbuf) < 0) {
      return IOError("While doing statvfs", fname, errno);
    }

    // sbuf.bfree is total free space available to root
    // sbuf.bavail is total free space available to unprivileged user
    // sbuf.bavail <= sbuf.bfree ... pick correct based upon effective user id
    if (geteuid()) {
      // non-zero user is unprivileged, or -1 if error. take more conservative
      // size
      *free_space = ((uint64_t)sbuf.f_bsize * sbuf.f_bavail);
    } else {
      // root user can access all disk space
      *free_space = ((uint64_t)sbuf.f_bsize * sbuf.f_bfree);
    }
    return IOStatus::OK();
  }
+
  // Sets *is_dir to whether `path` is a directory, using open + fstat so
  // the check applies to the path actually opened. *is_dir is written only
  // on success (and only when the caller passed a non-null pointer).
  IOStatus IsDirectory(const std::string& path, const IOOptions& /*opts*/,
                       bool* is_dir, IODebugContext* /*dbg*/) override {
    // First open
    int fd = -1;
    int flags = cloexec_flags(O_RDONLY, nullptr);
    {
      IOSTATS_TIMER_GUARD(open_nanos);
      fd = open(path.c_str(), flags);
    }
    if (fd < 0) {
      return IOError("While open for IsDirectory()", path, errno);
    }
    IOStatus io_s;
    struct stat sbuf;
    if (fstat(fd, &sbuf) < 0) {
      io_s = IOError("While doing stat for IsDirectory()", path, errno);
    }
    close(fd);
    // sbuf is read only when fstat succeeded (io_s still OK).
    if (io_s.ok() && nullptr != is_dir) {
      *is_dir = S_ISDIR(sbuf.st_mode);
    }
    return io_s;
  }
+
  // Returns FileOptions tuned for WAL writes: buffered I/O (mmap and direct
  // writes disabled) combined with the DB's WAL sync and write-buffer
  // settings.
  FileOptions OptimizeForLogWrite(const FileOptions& file_options,
                                  const DBOptions& db_options) const override {
    FileOptions optimized = file_options;
    optimized.use_mmap_writes = false;
    optimized.use_direct_writes = false;
    optimized.bytes_per_sync = db_options.wal_bytes_per_sync;
    // TODO(icanadi) it's faster if fallocate_with_keep_size is false, but it
    // breaks TransactionLogIteratorStallAtLastRecord unit test. Fix the unit
    // test and make this false
    optimized.fallocate_with_keep_size = true;
    optimized.writable_file_max_buffer_size =
        db_options.writable_file_max_buffer_size;
    return optimized;
  }
+
  // Returns FileOptions tuned for MANIFEST writes: buffered I/O with
  // fallocate configured to keep the reported file size unchanged.
  FileOptions OptimizeForManifestWrite(
      const FileOptions& file_options) const override {
    FileOptions optimized = file_options;
    optimized.use_mmap_writes = false;
    optimized.use_direct_writes = false;
    optimized.fallocate_with_keep_size = true;
    return optimized;
  }
+#ifdef OS_LINUX
  // Pins cached logical block sizes for the given DB paths so that repeated
  // direct-I/O opens avoid re-querying the block device.
  Status RegisterDbPaths(const std::vector<std::string>& paths) override {
    return logical_block_size_cache_.RefAndCacheLogicalBlockSize(paths);
  }
  // Releases the references taken by RegisterDbPaths(), evicting entries
  // whose reference count drops to zero.
  Status UnregisterDbPaths(const std::vector<std::string>& paths) override {
    logical_block_size_cache_.UnrefAndTryRemoveCachedLogicalBlockSize(paths);
    return Status::OK();
  }
+#endif
+ private:
+ bool forceMmapOff_ = false; // do we override Env options?
+
+ // Returns true iff the named directory exists and is a directory.
+ virtual bool DirExists(const std::string& dname) {
+ struct stat statbuf;
+ if (stat(dname.c_str(), &statbuf) == 0) {
+ return S_ISDIR(statbuf.st_mode);
+ }
+ return false; // stat() failed return false
+ }
+
  // Returns true when the filesystem holding `fd` is known to support
  // efficient fallocate() (ext4, xfs, tmpfs). Always false when fallocate
  // was not detected at build time.
  bool SupportsFastAllocate(int fd) {
#ifdef ROCKSDB_FALLOCATE_PRESENT
    struct statfs s;
    if (fstatfs(fd, &s)) {
      // Treat a failed fstatfs() conservatively as "not supported".
      return false;
    }
    switch (s.f_type) {
      case EXT4_SUPER_MAGIC:
        return true;
      case XFS_SUPER_MAGIC:
        return true;
      case TMPFS_MAGIC:
        return true;
      default:
        return false;
    }
#else
    (void)fd;
    return false;
#endif
  }
+
  // Checks (once per process) whether the filesystem holding `fd` supports
  // fast fallocate, and if not, disables mmap writes globally. Only valid on
  // the default FileSystem singleton (asserted) because the decision is
  // stored in this instance but intended to be process-wide.
  void MaybeForceDisableMmap(int fd) {
    static std::once_flag s_check_disk_for_mmap_once;
    assert(this == FileSystem::Default().get());
    std::call_once(
        s_check_disk_for_mmap_once,
        [this](int fdesc) {
          // this will be executed once in the program's lifetime.
          // do not use mmapWrite on non ext-3/xfs/tmpfs systems.
          if (!SupportsFastAllocate(fdesc)) {
            forceMmapOff_ = true;
          }
        },
        fd);
  }
+
+#ifdef ROCKSDB_IOURING_PRESENT
+ bool IsIOUringEnabled() {
+ if (RocksDbIOUringEnable && RocksDbIOUringEnable()) {
+ return true;
+ } else {
+ return false;
+ }
+ }
+#endif // ROCKSDB_IOURING_PRESENT
+
+ // EXPERIMENTAL
+ //
+ // TODO akankshamahajan:
+ // 1. Update Poll API to take into account min_completions
+ // and returns if number of handles in io_handles (any order) completed is
+ // equal to atleast min_completions.
+ // 2. Currently in case of direct_io, Read API is called because of which call
+ // to Poll API fails as it expects IOHandle to be populated.
  // Waits for the async reads behind `io_handles` to complete, invoking each
  // handle's callback as its completion is consumed from the thread-local
  // io_uring. Returns NotSupported when io_uring is unavailable.
  virtual IOStatus Poll(std::vector<void*>& io_handles,
                        size_t /*min_completions*/) override {
#if defined(ROCKSDB_IOURING_PRESENT)
    // io_uring_queue_init.
    struct io_uring* iu = nullptr;
    if (thread_local_io_urings_) {
      iu = static_cast<struct io_uring*>(thread_local_io_urings_->Get());
    }

    // Init failed, platform doesn't support io_uring.
    if (iu == nullptr) {
      return IOStatus::NotSupported("Poll");
    }

    for (size_t i = 0; i < io_handles.size(); i++) {
      // The request has been completed in earlier runs.
      if ((static_cast<Posix_IOHandle*>(io_handles[i]))->is_finished) {
        continue;
      }
      // Loop until IO for io_handles[i] is completed. Completions may arrive
      // in any order; other handles completed along the way are also marked
      // finished and their callbacks fired, so the outer loop skips them.
      while (true) {
        // io_uring_wait_cqe.
        struct io_uring_cqe* cqe = nullptr;
        ssize_t ret = io_uring_wait_cqe(iu, &cqe);
        if (ret) {
          // abort as it shouldn't be in indeterminate state and there is no
          // good way currently to handle this error.
          abort();
        }

        // Step 3: Populate the request.
        assert(cqe != nullptr);
        Posix_IOHandle* posix_handle =
            static_cast<Posix_IOHandle*>(io_uring_cqe_get_data(cqe));
        assert(posix_handle->iu == iu);
        if (posix_handle->iu != iu) {
          return IOStatus::IOError("");
        }
        // Reset cqe data to catch any stray reuse of it
        static_cast<struct io_uring_cqe*>(cqe)->user_data = 0xd5d5d5d5d5d5d5d5;

        FSReadRequest req;
        req.scratch = posix_handle->scratch;
        req.offset = posix_handle->offset;
        req.len = posix_handle->len;

        size_t finished_len = 0;
        size_t bytes_read = 0;
        bool read_again = false;
        UpdateResult(cqe, "", req.len, posix_handle->iov.iov_len,
                     true /*async_read*/, posix_handle->use_direct_io,
                     posix_handle->alignment, finished_len, &req, bytes_read,
                     read_again);
        posix_handle->is_finished = true;
        io_uring_cqe_seen(iu, cqe);
        posix_handle->cb(req, posix_handle->cb_arg);

        // Unused in the async path; see UpdateResult for their meaning in
        // the synchronous MultiRead path.
        (void)finished_len;
        (void)bytes_read;
        (void)read_again;

        // Stop once the completion for io_handles[i] itself was consumed.
        if (static_cast<Posix_IOHandle*>(io_handles[i]) == posix_handle) {
          break;
        }
      }
    }
    return IOStatus::OK();
#else
    (void)io_handles;
    return IOStatus::NotSupported("Poll");
#endif
  }
+
  // Cancels any still-pending async reads in `io_handles` by submitting an
  // io_uring cancel request per handle, then drains completions until each
  // handle has seen both its original request and the cancel request.
  virtual IOStatus AbortIO(std::vector<void*>& io_handles) override {
#if defined(ROCKSDB_IOURING_PRESENT)
    // io_uring_queue_init.
    struct io_uring* iu = nullptr;
    if (thread_local_io_urings_) {
      iu = static_cast<struct io_uring*>(thread_local_io_urings_->Get());
    }

    // Init failed, platform doesn't support io_uring.
    // If Poll is not supported then it didn't submit any request and it should
    // return OK.
    if (iu == nullptr) {
      return IOStatus::OK();
    }

    // Phase 1: submit a cancel request for every unfinished handle.
    for (size_t i = 0; i < io_handles.size(); i++) {
      Posix_IOHandle* posix_handle =
          static_cast<Posix_IOHandle*>(io_handles[i]);
      if (posix_handle->is_finished == true) {
        continue;
      }
      assert(posix_handle->iu == iu);
      if (posix_handle->iu != iu) {
        return IOStatus::IOError("");
      }

      // Prepare the cancel request.
      struct io_uring_sqe* sqe;
      sqe = io_uring_get_sqe(iu);

      // In order to cancel the request, sqe->addr of cancel request should
      // match with the read request submitted which is posix_handle->iov.
      io_uring_prep_cancel(sqe, &posix_handle->iov, 0);
      // Sets sqe->user_data to posix_handle.
      io_uring_sqe_set_data(sqe, posix_handle);

      // submit the request.
      ssize_t ret = io_uring_submit(iu);
      if (ret < 0) {
        fprintf(stderr, "io_uring_submit error: %ld\n", long(ret));
        return IOStatus::IOError("io_uring_submit() requested but returned " +
                                 std::to_string(ret));
      }
    }

    // Phase 2: after submitting the requests, wait for the requests.
    for (size_t i = 0; i < io_handles.size(); i++) {
      if ((static_cast<Posix_IOHandle*>(io_handles[i]))->is_finished) {
        continue;
      }

      while (true) {
        struct io_uring_cqe* cqe = nullptr;
        ssize_t ret = io_uring_wait_cqe(iu, &cqe);
        if (ret) {
          // abort as it shouldn't be in indeterminate state and there is no
          // good way currently to handle this error.
          abort();
        }
        assert(cqe != nullptr);

        // Returns cqe->user_data.
        Posix_IOHandle* posix_handle =
            static_cast<Posix_IOHandle*>(io_uring_cqe_get_data(cqe));
        assert(posix_handle->iu == iu);
        if (posix_handle->iu != iu) {
          return IOStatus::IOError("");
        }
        posix_handle->req_count++;

        // Reset cqe data to catch any stray reuse of it
        static_cast<struct io_uring_cqe*>(cqe)->user_data = 0xd5d5d5d5d5d5d5d5;
        io_uring_cqe_seen(iu, cqe);

        // - If the request is cancelled successfully, the original request is
        //   completed with -ECANCELED and the cancel request is completed with
        //   a result of 0.
        // - If the request was already running, the original may or
        //   may not complete in error. The cancel request will complete with
        //   -EALREADY for that case.
        // - And finally, if the request to cancel wasn't
        //   found, the cancel request is completed with -ENOENT.
        //
        // Every handle has to wait for 2 requests completion: original one and
        // the cancel request which is tracked by PosixHandle::req_count.
        if (posix_handle->req_count == 2 &&
            static_cast<Posix_IOHandle*>(io_handles[i]) == posix_handle) {
          posix_handle->is_finished = true;
          FSReadRequest req;
          req.status = IOStatus::Aborted();
          posix_handle->cb(req, posix_handle->cb_arg);

          break;
        }
      }
    }
    return IOStatus::OK();
#else
    // If Poll is not supported then it didn't submit any request and it should
    // return OK.
    (void)io_handles;
    return IOStatus::OK();
#endif
  }
+
+#if defined(ROCKSDB_IOURING_PRESENT)
+ // io_uring instance
+ std::unique_ptr<ThreadLocalPtr> thread_local_io_urings_;
+#endif
+
+ size_t page_size_;
+
+ // If true, allow non owner read access for db files. Otherwise, non-owner
+ // has no access to db files.
+ bool allow_non_owner_access_;
+
+#ifdef OS_LINUX
+ static LogicalBlockSizeCache logical_block_size_cache_;
+#endif
+ static size_t GetLogicalBlockSize(const std::string& fname, int fd);
+ // In non-direct IO mode, this directly returns kDefaultPageSize.
+ // Otherwise call GetLogicalBlockSize.
+ static size_t GetLogicalBlockSizeForReadIfNeeded(const EnvOptions& options,
+ const std::string& fname,
+ int fd);
+ static size_t GetLogicalBlockSizeForWriteIfNeeded(const EnvOptions& options,
+ const std::string& fname,
+ int fd);
+};
+
+#ifdef OS_LINUX
+LogicalBlockSizeCache PosixFileSystem::logical_block_size_cache_;
+#endif
+
// Returns the logical block size of the device backing `fname`/`fd`.
// On Linux, results are served from (and stored into) the process-wide
// logical_block_size_cache_; elsewhere the fd is queried directly.
size_t PosixFileSystem::GetLogicalBlockSize(const std::string& fname, int fd) {
#ifdef OS_LINUX
  return logical_block_size_cache_.GetLogicalBlockSize(fname, fd);
#else
  (void)fname;
  return PosixHelper::GetLogicalBlockSizeOfFd(fd);
#endif
}
+
+size_t PosixFileSystem::GetLogicalBlockSizeForReadIfNeeded(
+ const EnvOptions& options, const std::string& fname, int fd) {
+ return options.use_direct_reads
+ ? PosixFileSystem::GetLogicalBlockSize(fname, fd)
+ : kDefaultPageSize;
+}
+
+size_t PosixFileSystem::GetLogicalBlockSizeForWriteIfNeeded(
+ const EnvOptions& options, const std::string& fname, int fd) {
+ return options.use_direct_writes
+ ? PosixFileSystem::GetLogicalBlockSize(fname, fd)
+ : kDefaultPageSize;
+}
+
// Constructs the default POSIX filesystem. Non-owner read access to db files
// is allowed by default; callers restrict it via the relevant DB option.
PosixFileSystem::PosixFileSystem()
    : forceMmapOff_(false),
      page_size_(getpagesize()),
      allow_non_owner_access_(true) {
#if defined(ROCKSDB_IOURING_PRESENT)
  // Test whether IOUring is supported, and if it does, create a managing
  // object for thread local point so that in the future thread-local
  // io_uring can be created.
  struct io_uring* new_io_uring = CreateIOUring();
  if (new_io_uring != nullptr) {
    thread_local_io_urings_.reset(new ThreadLocalPtr(DeleteIOUring));
    // The ring above was only a capability probe; per-thread rings are
    // created lazily, so release it immediately.
    delete new_io_uring;
  }
#endif
}
+
+} // namespace
+
+//
+// Default Posix FileSystem
+//
// Returns the process-wide default (POSIX) FileSystem singleton.
std::shared_ptr<FileSystem> FileSystem::Default() {
  // Intentionally never destroyed: avoids destruction-order problems with
  // static-lifetime users of the default filesystem.
  STATIC_AVOID_DESTRUCTION(std::shared_ptr<FileSystem>, instance)
  (std::make_shared<PosixFileSystem>());
  return instance;
}
+
+#ifndef ROCKSDB_LITE
// Registers "posix" / "posix://..." with the object registry so the POSIX
// filesystem can be instantiated from a URI or config string.
static FactoryFunc<FileSystem> posix_filesystem_reg =
    ObjectLibrary::Default()->AddFactory<FileSystem>(
        ObjectLibrary::PatternEntry("posix").AddSeparator("://", false),
        [](const std::string& /* uri */, std::unique_ptr<FileSystem>* f,
           std::string* /* errmsg */) {
          f->reset(new PosixFileSystem());
          return f->get();
        });
+#endif
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif
diff --git a/src/rocksdb/env/fs_readonly.h b/src/rocksdb/env/fs_readonly.h
new file mode 100644
index 000000000..1bbe60784
--- /dev/null
+++ b/src/rocksdb/env/fs_readonly.h
@@ -0,0 +1,107 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/file_system.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A FileSystem wrapper that only allows read-only operation.
+//
+// This class has not been fully analyzed for providing strong security
+// guarantees.
class ReadOnlyFileSystem : public FileSystemWrapper {
  // Common failure status for every mutating operation below. Marked
  // (asserted) non-retryable: retrying can never succeed on a read-only
  // filesystem.
  static inline IOStatus FailReadOnly() {
    IOStatus s = IOStatus::IOError("Attempted write to ReadOnlyFileSystem");
    assert(s.GetRetryable() == false);
    return s;
  }

 public:
  explicit ReadOnlyFileSystem(const std::shared_ptr<FileSystem>& base)
      : FileSystemWrapper(base) {}

  static const char* kClassName() { return "ReadOnlyFileSystem"; }
  const char* Name() const override { return kClassName(); }

  // Every mutating operation is overridden to fail; read-side operations
  // are inherited unchanged from FileSystemWrapper.
  IOStatus NewWritableFile(const std::string& /*fname*/,
                           const FileOptions& /*options*/,
                           std::unique_ptr<FSWritableFile>* /*result*/,
                           IODebugContext* /*dbg*/) override {
    return FailReadOnly();
  }
  IOStatus ReuseWritableFile(const std::string& /*fname*/,
                             const std::string& /*old_fname*/,
                             const FileOptions& /*options*/,
                             std::unique_ptr<FSWritableFile>* /*result*/,
                             IODebugContext* /*dbg*/) override {
    return FailReadOnly();
  }
  IOStatus NewRandomRWFile(const std::string& /*fname*/,
                           const FileOptions& /*options*/,
                           std::unique_ptr<FSRandomRWFile>* /*result*/,
                           IODebugContext* /*dbg*/) override {
    return FailReadOnly();
  }
  IOStatus NewDirectory(const std::string& /*dir*/,
                        const IOOptions& /*options*/,
                        std::unique_ptr<FSDirectory>* /*result*/,
                        IODebugContext* /*dbg*/) override {
    return FailReadOnly();
  }
  IOStatus DeleteFile(const std::string& /*fname*/,
                      const IOOptions& /*options*/,
                      IODebugContext* /*dbg*/) override {
    return FailReadOnly();
  }
  IOStatus CreateDir(const std::string& /*dirname*/,
                     const IOOptions& /*options*/,
                     IODebugContext* /*dbg*/) override {
    return FailReadOnly();
  }
  IOStatus CreateDirIfMissing(const std::string& dirname,
                              const IOOptions& options,
                              IODebugContext* dbg) override {
    // Allow if dir already exists
    bool is_dir = false;
    IOStatus s = IsDirectory(dirname, options, &is_dir, dbg);
    if (s.ok() && is_dir) {
      return s;
    } else {
      return FailReadOnly();
    }
  }
  IOStatus DeleteDir(const std::string& /*dirname*/,
                     const IOOptions& /*options*/,
                     IODebugContext* /*dbg*/) override {
    return FailReadOnly();
  }
  IOStatus RenameFile(const std::string& /*src*/, const std::string& /*dest*/,
                      const IOOptions& /*options*/,
                      IODebugContext* /*dbg*/) override {
    return FailReadOnly();
  }
  IOStatus LinkFile(const std::string& /*src*/, const std::string& /*dest*/,
                    const IOOptions& /*options*/,
                    IODebugContext* /*dbg*/) override {
    return FailReadOnly();
  }
  IOStatus LockFile(const std::string& /*fname*/, const IOOptions& /*options*/,
                    FileLock** /*lock*/, IODebugContext* /*dbg*/) override {
    return FailReadOnly();
  }
  // NewLogger is treated as a write because loggers append to their file.
  IOStatus NewLogger(const std::string& /*fname*/, const IOOptions& /*options*/,
                     std::shared_ptr<Logger>* /*result*/,
                     IODebugContext* /*dbg*/) override {
    return FailReadOnly();
  }
};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/env/fs_remap.cc b/src/rocksdb/env/fs_remap.cc
new file mode 100644
index 000000000..fd9241181
--- /dev/null
+++ b/src/rocksdb/env/fs_remap.cc
@@ -0,0 +1,343 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "env/fs_remap.h"
+
+namespace ROCKSDB_NAMESPACE {
+
// Delegates all I/O to `base`; subclasses define the name mapping via
// EncodePath() / EncodePathWithNewBasename().
RemapFileSystem::RemapFileSystem(const std::shared_ptr<FileSystem>& base)
    : FileSystemWrapper(base) {}
+
// Mapping used where the final path component need not already exist
// (e.g. file creation). Subclasses may override; by default it is the same
// mapping as EncodePath().
std::pair<IOStatus, std::string> RemapFileSystem::EncodePathWithNewBasename(
    const std::string& path) {
  // No difference by default
  return EncodePath(path);
}
+
+Status RemapFileSystem::RegisterDbPaths(const std::vector<std::string>& paths) {
+ std::vector<std::string> encoded_paths;
+ encoded_paths.reserve(paths.size());
+ for (auto& path : paths) {
+ auto status_and_enc_path = EncodePathWithNewBasename(path);
+ if (!status_and_enc_path.first.ok()) {
+ return status_and_enc_path.first;
+ }
+ encoded_paths.emplace_back(status_and_enc_path.second);
+ }
+ return FileSystemWrapper::RegisterDbPaths(encoded_paths);
+}
+
+Status RemapFileSystem::UnregisterDbPaths(
+ const std::vector<std::string>& paths) {
+ std::vector<std::string> encoded_paths;
+ encoded_paths.reserve(paths.size());
+ for (auto& path : paths) {
+ auto status_and_enc_path = EncodePathWithNewBasename(path);
+ if (!status_and_enc_path.first.ok()) {
+ return status_and_enc_path.first;
+ }
+ encoded_paths.emplace_back(status_and_enc_path.second);
+ }
+ return FileSystemWrapper::UnregisterDbPaths(encoded_paths);
+}
+
+IOStatus RemapFileSystem::NewSequentialFile(
+ const std::string& fname, const FileOptions& options,
+ std::unique_ptr<FSSequentialFile>* result, IODebugContext* dbg) {
+ auto status_and_enc_path = EncodePathWithNewBasename(fname);
+ if (!status_and_enc_path.first.ok()) {
+ return status_and_enc_path.first;
+ }
+ return FileSystemWrapper::NewSequentialFile(status_and_enc_path.second,
+ options, result, dbg);
+}
+
+IOStatus RemapFileSystem::NewRandomAccessFile(
+ const std::string& fname, const FileOptions& options,
+ std::unique_ptr<FSRandomAccessFile>* result, IODebugContext* dbg) {
+ auto status_and_enc_path = EncodePathWithNewBasename(fname);
+ if (!status_and_enc_path.first.ok()) {
+ return status_and_enc_path.first;
+ }
+ return FileSystemWrapper::NewRandomAccessFile(status_and_enc_path.second,
+ options, result, dbg);
+}
+
+IOStatus RemapFileSystem::NewWritableFile(
+ const std::string& fname, const FileOptions& options,
+ std::unique_ptr<FSWritableFile>* result, IODebugContext* dbg) {
+ auto status_and_enc_path = EncodePathWithNewBasename(fname);
+ if (!status_and_enc_path.first.ok()) {
+ return status_and_enc_path.first;
+ }
+ return FileSystemWrapper::NewWritableFile(status_and_enc_path.second, options,
+ result, dbg);
+}
+
+IOStatus RemapFileSystem::ReuseWritableFile(
+ const std::string& fname, const std::string& old_fname,
+ const FileOptions& options, std::unique_ptr<FSWritableFile>* result,
+ IODebugContext* dbg) {
+ auto status_and_enc_path = EncodePathWithNewBasename(fname);
+ if (!status_and_enc_path.first.ok()) {
+ return status_and_enc_path.first;
+ }
+ auto status_and_old_enc_path = EncodePath(old_fname);
+ if (!status_and_old_enc_path.first.ok()) {
+ return status_and_old_enc_path.first;
+ }
+ return FileSystemWrapper::ReuseWritableFile(status_and_old_enc_path.second,
+ status_and_old_enc_path.second,
+ options, result, dbg);
+}
+
+IOStatus RemapFileSystem::NewRandomRWFile(
+ const std::string& fname, const FileOptions& options,
+ std::unique_ptr<FSRandomRWFile>* result, IODebugContext* dbg) {
+ auto status_and_enc_path = EncodePathWithNewBasename(fname);
+ if (!status_and_enc_path.first.ok()) {
+ return status_and_enc_path.first;
+ }
+ return FileSystemWrapper::NewRandomRWFile(status_and_enc_path.second, options,
+ result, dbg);
+}
+
// Opens a directory handle for the remapped `dir`. The returned FSDirectory
// is additionally wrapped so that FsyncWithDirOptions() remaps
// DirFsyncOptions::renamed_new_name before delegating.
IOStatus RemapFileSystem::NewDirectory(const std::string& dir,
                                       const IOOptions& options,
                                       std::unique_ptr<FSDirectory>* result,
                                       IODebugContext* dbg) {
  // A hassle to remap DirFsyncOptions::renamed_new_name
  class RemapFSDirectory : public FSDirectoryWrapper {
   public:
    RemapFSDirectory(RemapFileSystem* fs, std::unique_ptr<FSDirectory>&& t)
        : FSDirectoryWrapper(std::move(t)), fs_(fs) {}
    IOStatus FsyncWithDirOptions(
        const IOOptions& options, IODebugContext* dbg,
        const DirFsyncOptions& dir_fsync_options) override {
      if (dir_fsync_options.renamed_new_name.empty()) {
        return FSDirectoryWrapper::FsyncWithDirOptions(options, dbg,
                                                       dir_fsync_options);
      } else {
        // Remap the renamed-file hint so the underlying filesystem sees a
        // path in its own namespace.
        auto status_and_enc_path =
            fs_->EncodePath(dir_fsync_options.renamed_new_name);
        if (status_and_enc_path.first.ok()) {
          DirFsyncOptions mapped_options = dir_fsync_options;
          mapped_options.renamed_new_name = status_and_enc_path.second;
          return FSDirectoryWrapper::FsyncWithDirOptions(options, dbg,
                                                         mapped_options);
        } else {
          return status_and_enc_path.first;
        }
      }
    }

   private:
    // Back-pointer to the owning filesystem; used only for EncodePath().
    RemapFileSystem* const fs_;
  };

  auto status_and_enc_path = EncodePathWithNewBasename(dir);
  if (!status_and_enc_path.first.ok()) {
    return status_and_enc_path.first;
  }
  IOStatus ios = FileSystemWrapper::NewDirectory(status_and_enc_path.second,
                                                 options, result, dbg);
  if (ios.ok()) {
    *result = std::make_unique<RemapFSDirectory>(this, std::move(*result));
  }
  return ios;
}
+
+IOStatus RemapFileSystem::FileExists(const std::string& fname,
+ const IOOptions& options,
+ IODebugContext* dbg) {
+ auto status_and_enc_path = EncodePathWithNewBasename(fname);
+ if (!status_and_enc_path.first.ok()) {
+ return status_and_enc_path.first;
+ }
+ return FileSystemWrapper::FileExists(status_and_enc_path.second, options,
+ dbg);
+}
+
+IOStatus RemapFileSystem::GetChildren(const std::string& dir,
+ const IOOptions& options,
+ std::vector<std::string>* result,
+ IODebugContext* dbg) {
+ auto status_and_enc_path = EncodePath(dir);
+ if (!status_and_enc_path.first.ok()) {
+ return status_and_enc_path.first;
+ }
+ return FileSystemWrapper::GetChildren(status_and_enc_path.second, options,
+ result, dbg);
+}
+
+IOStatus RemapFileSystem::GetChildrenFileAttributes(
+ const std::string& dir, const IOOptions& options,
+ std::vector<FileAttributes>* result, IODebugContext* dbg) {
+ auto status_and_enc_path = EncodePath(dir);
+ if (!status_and_enc_path.first.ok()) {
+ return status_and_enc_path.first;
+ }
+ return FileSystemWrapper::GetChildrenFileAttributes(
+ status_and_enc_path.second, options, result, dbg);
+}
+
+IOStatus RemapFileSystem::DeleteFile(const std::string& fname,
+ const IOOptions& options,
+ IODebugContext* dbg) {
+ auto status_and_enc_path = EncodePath(fname);
+ if (!status_and_enc_path.first.ok()) {
+ return status_and_enc_path.first;
+ }
+ return FileSystemWrapper::DeleteFile(status_and_enc_path.second, options,
+ dbg);
+}
+
+IOStatus RemapFileSystem::CreateDir(const std::string& dirname,
+ const IOOptions& options,
+ IODebugContext* dbg) {
+ auto status_and_enc_path = EncodePathWithNewBasename(dirname);
+ if (!status_and_enc_path.first.ok()) {
+ return status_and_enc_path.first;
+ }
+ return FileSystemWrapper::CreateDir(status_and_enc_path.second, options, dbg);
+}
+
+IOStatus RemapFileSystem::CreateDirIfMissing(const std::string& dirname,
+ const IOOptions& options,
+ IODebugContext* dbg) {
+ auto status_and_enc_path = EncodePathWithNewBasename(dirname);
+ if (!status_and_enc_path.first.ok()) {
+ return status_and_enc_path.first;
+ }
+ return FileSystemWrapper::CreateDirIfMissing(status_and_enc_path.second,
+ options, dbg);
+}
+
+IOStatus RemapFileSystem::DeleteDir(const std::string& dirname,
+ const IOOptions& options,
+ IODebugContext* dbg) {
+ auto status_and_enc_path = EncodePath(dirname);
+ if (!status_and_enc_path.first.ok()) {
+ return status_and_enc_path.first;
+ }
+ return FileSystemWrapper::DeleteDir(status_and_enc_path.second, options, dbg);
+}
+
+IOStatus RemapFileSystem::GetFileSize(const std::string& fname,
+ const IOOptions& options,
+ uint64_t* file_size,
+ IODebugContext* dbg) {
+ auto status_and_enc_path = EncodePath(fname);
+ if (!status_and_enc_path.first.ok()) {
+ return status_and_enc_path.first;
+ }
+ return FileSystemWrapper::GetFileSize(status_and_enc_path.second, options,
+ file_size, dbg);
+}
+
+IOStatus RemapFileSystem::GetFileModificationTime(const std::string& fname,
+ const IOOptions& options,
+ uint64_t* file_mtime,
+ IODebugContext* dbg) {
+ auto status_and_enc_path = EncodePath(fname);
+ if (!status_and_enc_path.first.ok()) {
+ return status_and_enc_path.first;
+ }
+ return FileSystemWrapper::GetFileModificationTime(status_and_enc_path.second,
+ options, file_mtime, dbg);
+}
+
+IOStatus RemapFileSystem::IsDirectory(const std::string& path,
+ const IOOptions& options, bool* is_dir,
+ IODebugContext* dbg) {
+ auto status_and_enc_path = EncodePath(path);
+ if (!status_and_enc_path.first.ok()) {
+ return status_and_enc_path.first;
+ }
+ return FileSystemWrapper::IsDirectory(status_and_enc_path.second, options,
+ is_dir, dbg);
+}
+
+IOStatus RemapFileSystem::RenameFile(const std::string& src,
+ const std::string& dest,
+ const IOOptions& options,
+ IODebugContext* dbg) {
+ auto status_and_src_enc_path = EncodePath(src);
+ if (!status_and_src_enc_path.first.ok()) {
+ if (status_and_src_enc_path.first.IsNotFound()) {
+ const IOStatus& s = status_and_src_enc_path.first;
+ status_and_src_enc_path.first = IOStatus::PathNotFound(s.ToString());
+ }
+ return status_and_src_enc_path.first;
+ }
+ auto status_and_dest_enc_path = EncodePathWithNewBasename(dest);
+ if (!status_and_dest_enc_path.first.ok()) {
+ return status_and_dest_enc_path.first;
+ }
+ return FileSystemWrapper::RenameFile(status_and_src_enc_path.second,
+ status_and_dest_enc_path.second, options,
+ dbg);
+}
+
+IOStatus RemapFileSystem::LinkFile(const std::string& src,
+ const std::string& dest,
+ const IOOptions& options,
+ IODebugContext* dbg) {
+ auto status_and_src_enc_path = EncodePath(src);
+ if (!status_and_src_enc_path.first.ok()) {
+ return status_and_src_enc_path.first;
+ }
+ auto status_and_dest_enc_path = EncodePathWithNewBasename(dest);
+ if (!status_and_dest_enc_path.first.ok()) {
+ return status_and_dest_enc_path.first;
+ }
+ return FileSystemWrapper::LinkFile(status_and_src_enc_path.second,
+ status_and_dest_enc_path.second, options,
+ dbg);
+}
+
+IOStatus RemapFileSystem::LockFile(const std::string& fname,
+ const IOOptions& options, FileLock** lock,
+ IODebugContext* dbg) {
+ auto status_and_enc_path = EncodePathWithNewBasename(fname);
+ if (!status_and_enc_path.first.ok()) {
+ return status_and_enc_path.first;
+ }
+ // FileLock subclasses may store path (e.g., PosixFileLock stores it). We
+ // can skip stripping the chroot directory from this path because callers
+ // shouldn't use it.
+ return FileSystemWrapper::LockFile(status_and_enc_path.second, options, lock,
+ dbg);
+}
+
+IOStatus RemapFileSystem::NewLogger(const std::string& fname,
+ const IOOptions& options,
+ std::shared_ptr<Logger>* result,
+ IODebugContext* dbg) {
+ auto status_and_enc_path = EncodePathWithNewBasename(fname);
+ if (!status_and_enc_path.first.ok()) {
+ return status_and_enc_path.first;
+ }
+ return FileSystemWrapper::NewLogger(status_and_enc_path.second, options,
+ result, dbg);
+}
+
+IOStatus RemapFileSystem::GetAbsolutePath(const std::string& db_path,
+ const IOOptions& options,
+ std::string* output_path,
+ IODebugContext* dbg) {
+ auto status_and_enc_path = EncodePathWithNewBasename(db_path);
+ if (!status_and_enc_path.first.ok()) {
+ return status_and_enc_path.first;
+ }
+ return FileSystemWrapper::GetAbsolutePath(status_and_enc_path.second, options,
+ output_path, dbg);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/env/fs_remap.h b/src/rocksdb/env/fs_remap.h
new file mode 100644
index 000000000..1f6e061fd
--- /dev/null
+++ b/src/rocksdb/env/fs_remap.h
@@ -0,0 +1,139 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <utility>
+
+#include "rocksdb/file_system.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// An abstract FileSystem wrapper that creates a view of an existing
+// FileSystem by remapping names in some way.
+//
+// This class has not been fully analyzed for providing strong security
+// guarantees.
class RemapFileSystem : public FileSystemWrapper {
 public:
  explicit RemapFileSystem(const std::shared_ptr<FileSystem>& base);

 protected:
  // Returns status and mapped-to path in the wrapped filesystem.
  // If it returns non-OK status, the returned path should not be used.
  virtual std::pair<IOStatus, std::string> EncodePath(
      const std::string& path) = 0;

  // Similar to EncodePath() except used in cases in which it is OK for
  // no file or directory on 'path' to already exist, such as if the
  // operation would create one. However, the parent of 'path' is expected
  // to exist for the operation to succeed.
  // Default implementation: call EncodePath
  virtual std::pair<IOStatus, std::string> EncodePathWithNewBasename(
      const std::string& path);

 public:
  // Left abstract:
  // const char* Name() const override { ... }
  static const char* kClassName() { return "RemapFileSystem"; }
  // Reports this class (or any wrapped ancestor) as a match for `id`.
  bool IsInstanceOf(const std::string& id) const override {
    if (id == kClassName()) {
      return true;
    } else {
      return FileSystemWrapper::IsInstanceOf(id);
    }
  }

  // Each override below translates its path argument(s) through EncodePath()
  // or EncodePathWithNewBasename() and delegates to the wrapped FileSystem;
  // see fs_remap.cc for the per-operation mapping choice.
  Status RegisterDbPaths(const std::vector<std::string>& paths) override;

  Status UnregisterDbPaths(const std::vector<std::string>& paths) override;

  IOStatus NewSequentialFile(const std::string& fname,
                             const FileOptions& options,
                             std::unique_ptr<FSSequentialFile>* result,
                             IODebugContext* dbg) override;

  IOStatus NewRandomAccessFile(const std::string& fname,
                               const FileOptions& options,
                               std::unique_ptr<FSRandomAccessFile>* result,
                               IODebugContext* dbg) override;

  IOStatus NewWritableFile(const std::string& fname, const FileOptions& options,
                           std::unique_ptr<FSWritableFile>* result,
                           IODebugContext* dbg) override;

  IOStatus ReuseWritableFile(const std::string& fname,
                             const std::string& old_fname,
                             const FileOptions& options,
                             std::unique_ptr<FSWritableFile>* result,
                             IODebugContext* dbg) override;

  IOStatus NewRandomRWFile(const std::string& fname, const FileOptions& options,
                           std::unique_ptr<FSRandomRWFile>* result,
                           IODebugContext* dbg) override;

  IOStatus NewDirectory(const std::string& dir, const IOOptions& options,
                        std::unique_ptr<FSDirectory>* result,
                        IODebugContext* dbg) override;

  IOStatus FileExists(const std::string& fname, const IOOptions& options,
                      IODebugContext* dbg) override;

  IOStatus GetChildren(const std::string& dir, const IOOptions& options,
                       std::vector<std::string>* result,
                       IODebugContext* dbg) override;

  IOStatus GetChildrenFileAttributes(const std::string& dir,
                                     const IOOptions& options,
                                     std::vector<FileAttributes>* result,
                                     IODebugContext* dbg) override;

  IOStatus DeleteFile(const std::string& fname, const IOOptions& options,
                      IODebugContext* dbg) override;

  IOStatus CreateDir(const std::string& dirname, const IOOptions& options,
                     IODebugContext* dbg) override;

  IOStatus CreateDirIfMissing(const std::string& dirname,
                              const IOOptions& options,
                              IODebugContext* dbg) override;

  IOStatus DeleteDir(const std::string& dirname, const IOOptions& options,
                     IODebugContext* dbg) override;

  IOStatus GetFileSize(const std::string& fname, const IOOptions& options,
                       uint64_t* file_size, IODebugContext* dbg) override;

  IOStatus GetFileModificationTime(const std::string& fname,
                                   const IOOptions& options,
                                   uint64_t* file_mtime,
                                   IODebugContext* dbg) override;

  IOStatus IsDirectory(const std::string& path, const IOOptions& options,
                       bool* is_dir, IODebugContext* dbg) override;

  IOStatus RenameFile(const std::string& src, const std::string& dest,
                      const IOOptions& options, IODebugContext* dbg) override;

  IOStatus LinkFile(const std::string& src, const std::string& dest,
                    const IOOptions& options, IODebugContext* dbg) override;

  IOStatus LockFile(const std::string& fname, const IOOptions& options,
                    FileLock** lock, IODebugContext* dbg) override;

  IOStatus NewLogger(const std::string& fname, const IOOptions& options,
                     std::shared_ptr<Logger>* result,
                     IODebugContext* dbg) override;

  IOStatus GetAbsolutePath(const std::string& db_path, const IOOptions& options,
                           std::string* output_path,
                           IODebugContext* dbg) override;
};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/env/io_posix.cc b/src/rocksdb/env/io_posix.cc
new file mode 100644
index 000000000..0ec0e9c83
--- /dev/null
+++ b/src/rocksdb/env/io_posix.cc
@@ -0,0 +1,1733 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifdef ROCKSDB_LIB_IO_POSIX
+#include "env/io_posix.h"
+
+#include <errno.h>
+#include <fcntl.h>
+
+#include <algorithm>
+#if defined(OS_LINUX)
+#include <linux/fs.h>
+#ifndef FALLOC_FL_KEEP_SIZE
+#include <linux/falloc.h>
+#endif
+#endif
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#ifdef OS_LINUX
+#include <sys/statfs.h>
+#include <sys/sysmacros.h>
+#endif
+#include "monitoring/iostats_context_imp.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/slice.h"
+#include "test_util/sync_point.h"
+#include "util/autovector.h"
+#include "util/coding.h"
+#include "util/string_util.h"
+
+#if defined(OS_LINUX) && !defined(F_SET_RW_HINT)
+#define F_LINUX_SPECIFIC_BASE 1024
+#define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12)
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+// Formats "context: file_name" for error messages; returns the context alone
+// when no file name is available.
+std::string IOErrorMsg(const std::string& context,
+                       const std::string& file_name) {
+  if (file_name.empty()) {
+    return context;
+  }
+  return context + ": " + file_name;
+}
+
+// Maps an errno value to the matching IOStatus subtype.
+// file_name can be left empty if it is not known.
+IOStatus IOError(const std::string& context, const std::string& file_name,
+                 int err_number) {
+  switch (err_number) {
+    case ENOSPC: {
+      // Out-of-space is transient: mark the status retryable so callers may
+      // resume once space is reclaimed.
+      IOStatus s = IOStatus::NoSpace(IOErrorMsg(context, file_name),
+                                     errnoStr(err_number).c_str());
+      s.SetRetryable(true);
+      return s;
+    }
+    case ESTALE:
+      // Stale NFS-style handle; use the dedicated subcode so callers can
+      // distinguish it.
+      return IOStatus::IOError(IOStatus::kStaleFile);
+    case ENOENT:
+      return IOStatus::PathNotFound(IOErrorMsg(context, file_name),
+                                    errnoStr(err_number).c_str());
+    default:
+      return IOStatus::IOError(IOErrorMsg(context, file_name),
+                               errnoStr(err_number).c_str());
+  }
+}
+
+// A wrapper for posix_fadvise. On platforms without fadvise support it is a
+// no-op that reports success (returns 0).
+int Fadvise(int fd, off_t offset, size_t len, int advice) {
+#ifdef OS_LINUX
+  return posix_fadvise(fd, offset, len, advice);
+#else
+  (void)fd;
+  (void)offset;
+  (void)len;
+  (void)advice;
+  return 0;  // simply do nothing.
+#endif
+}
+
+// A wrapper for posix_madvise. On platforms without madvise support it is a
+// no-op that reports success (returns 0).
+int Madvise(void* addr, size_t len, int advice) {
+#ifdef OS_LINUX
+  return posix_madvise(addr, len, advice);
+#else
+  (void)addr;
+  (void)len;
+  (void)advice;
+  return 0;  // simply do nothing.
+#endif
+}
+
+namespace {
+
+// On MacOS (and probably *BSD), the posix write and pwrite calls do not support
+// buffers larger than 2^31-1 bytes. These two wrappers fix this issue by
+// cutting the buffer in 1GB chunks. We use this chunk size to be sure to keep
+// the writes aligned.
+
+// Writes `nbyte` bytes from `buf` to `fd`, retrying on EINTR and splitting
+// the I/O into 1GB chunks (see note above about MacOS/*BSD write limits).
+// Returns false on any non-EINTR write error.
+bool PosixWrite(int fd, const char* buf, size_t nbyte) {
+  const size_t kLimit1Gb = 1UL << 30;
+
+  const char* src = buf;
+  size_t left = nbyte;
+
+  while (left != 0) {
+    size_t bytes_to_write = std::min(left, kLimit1Gb);
+
+    ssize_t done = write(fd, src, bytes_to_write);
+    if (done < 0) {
+      if (errno == EINTR) {
+        // Interrupted before any data was written; retry the same chunk.
+        continue;
+      }
+      return false;
+    }
+    // Short writes are handled by looping on the remainder.
+    left -= done;
+    src += done;
+  }
+  return true;
+}
+
+// pwrite() analogue of PosixWrite(): writes `nbyte` bytes at `offset`,
+// retrying on EINTR and chunking at 1GB. Returns false on any other error.
+bool PosixPositionedWrite(int fd, const char* buf, size_t nbyte, off_t offset) {
+  const size_t kLimit1Gb = 1UL << 30;
+
+  const char* src = buf;
+  size_t left = nbyte;
+
+  while (left != 0) {
+    size_t bytes_to_write = std::min(left, kLimit1Gb);
+
+    ssize_t done = pwrite(fd, src, bytes_to_write, offset);
+    if (done < 0) {
+      if (errno == EINTR) {
+        continue;
+      }
+      return false;
+    }
+    // Advance both the file offset and the source pointer past what was
+    // actually written; short writes are retried by the loop.
+    left -= done;
+    offset += done;
+    src += done;
+  }
+
+  return true;
+}
+
+#ifdef ROCKSDB_RANGESYNC_PRESENT
+
+#if !defined(ZFS_SUPER_MAGIC)
+// The magic number for ZFS was not exposed until recently. It should be fixed
+// forever so we can just copy the magic number here.
+#define ZFS_SUPER_MAGIC 0x2fc12fc1
+#endif
+
+// Returns false only for cases where `sync_file_range` is positively known to
+// misbehave on the filesystem backing `fd`; otherwise returns true.
+bool IsSyncFileRangeSupported(int fd) {
+  // This function tracks and checks for cases where we know `sync_file_range`
+  // definitely will not work properly despite passing the compile-time check
+  // (`ROCKSDB_RANGESYNC_PRESENT`). If we are unsure, or if any of the checks
+  // fail in unexpected ways, we allow `sync_file_range` to be used. This way
+  // should minimize risk of impacting existing use cases.
+  struct statfs buf;
+  int ret = fstatfs(fd, &buf);
+  // The asserts only fire in debug builds; release builds fall through to the
+  // permissive default on unexpected failures.
+  assert(ret == 0);
+  if (ret == 0 && buf.f_type == ZFS_SUPER_MAGIC) {
+    // Testing on ZFS showed the writeback did not happen asynchronously when
+    // `sync_file_range` was called, even though it returned success. Avoid it
+    // and use `fdatasync` instead to preserve the contract of `bytes_per_sync`,
+    // even though this'll incur extra I/O for metadata.
+    return false;
+  }
+
+  // Probe with a zero-length range: a no-op when supported, ENOSYS otherwise.
+  ret = sync_file_range(fd, 0 /* offset */, 0 /* nbytes */, 0 /* flags */);
+  assert(!(ret == -1 && errno != ENOSYS));
+  if (ret == -1 && errno == ENOSYS) {
+    // `sync_file_range` is not implemented on all platforms even if
+    // compile-time checks pass and a supported filesystem is in-use. For
+    // example, using ext4 on WSL (Windows Subsystem for Linux),
+    // `sync_file_range()` returns `ENOSYS`
+    // ("Function not implemented").
+    return false;
+  }
+  // None of the known cases matched, so allow `sync_file_range` use.
+  return true;
+}
+
+#undef ZFS_SUPER_MAGIC
+
+#endif // ROCKSDB_RANGESYNC_PRESENT
+
+} // anonymous namespace
+
+/*
+ * PosixSequentialFile
+ */
+// Takes ownership of the already-open handles: `file` (buffered path) or `fd`
+// (direct-I/O path); the destructor closes whichever one is in use.
+PosixSequentialFile::PosixSequentialFile(const std::string& fname, FILE* file,
+                                         int fd, size_t logical_block_size,
+                                         const EnvOptions& options)
+    : filename_(fname),
+      file_(file),
+      fd_(fd),
+      use_direct_io_(options.use_direct_reads),
+      logical_sector_size_(logical_block_size) {
+  // Direct I/O and mmap reads are mutually exclusive.
+  assert(!options.use_direct_reads || !options.use_mmap_reads);
+}
+
+// Closes the owned handle: the stdio FILE* for buffered reads, or the raw fd
+// when direct I/O is in use.
+PosixSequentialFile::~PosixSequentialFile() {
+  if (!use_direct_io()) {
+    assert(file_);
+    fclose(file_);
+  } else {
+    assert(fd_);
+    close(fd_);
+  }
+}
+
+// Buffered sequential read of up to `n` bytes into `scratch`; *result refers
+// to `scratch`. Only valid for the non-direct-I/O path. EOF yields OK with a
+// short result.
+IOStatus PosixSequentialFile::Read(size_t n, const IOOptions& /*opts*/,
+                                   Slice* result, char* scratch,
+                                   IODebugContext* /*dbg*/) {
+  assert(result != nullptr && !use_direct_io());
+  IOStatus s;
+  size_t r = 0;
+  do {
+    // Retry reads interrupted by signals (EINTR).
+    clearerr(file_);
+    r = fread_unlocked(scratch, 1, n, file_);
+  } while (r == 0 && ferror(file_) && errno == EINTR);
+  *result = Slice(scratch, r);
+  if (r < n) {
+    if (feof(file_)) {
+      // We leave status as ok if we hit the end of the file
+      // We also clear the error so that the reads can continue
+      // if a new data is written to the file
+      clearerr(file_);
+    } else {
+      // A partial read with an error: return a non-ok status
+      s = IOError("While reading file sequentially", filename_, errno);
+    }
+  }
+  return s;
+}
+
+// Direct-I/O positioned read: `offset`, `n` and `scratch` must all be
+// sector-aligned. Loops over pread() to handle short reads and EINTR; stops
+// early at an unaligned (end-of-file) read.
+IOStatus PosixSequentialFile::PositionedRead(uint64_t offset, size_t n,
+                                             const IOOptions& /*opts*/,
+                                             Slice* result, char* scratch,
+                                             IODebugContext* /*dbg*/) {
+  assert(use_direct_io());
+  assert(IsSectorAligned(offset, GetRequiredBufferAlignment()));
+  assert(IsSectorAligned(n, GetRequiredBufferAlignment()));
+  assert(IsSectorAligned(scratch, GetRequiredBufferAlignment()));
+
+  IOStatus s;
+  ssize_t r = -1;
+  size_t left = n;
+  char* ptr = scratch;
+  while (left > 0) {
+    r = pread(fd_, ptr, left, static_cast<off_t>(offset));
+    if (r <= 0) {
+      if (r == -1 && errno == EINTR) {
+        continue;
+      }
+      break;
+    }
+    ptr += r;
+    offset += r;
+    left -= r;
+    if (!IsSectorAligned(r, GetRequiredBufferAlignment())) {
+      // Bytes reads don't fill sectors. Should only happen at the end
+      // of the file.
+      break;
+    }
+  }
+  if (r < 0) {
+    // An error: return a non-ok status. Note `offset` has been advanced past
+    // the sectors already read, so the message reports the failing position,
+    // not the original request offset.
+    s = IOError("While pread " + std::to_string(n) + " bytes from offset " +
+                    std::to_string(offset),
+                filename_, errno);
+  }
+  *result = Slice(scratch, (r < 0) ? 0 : n - left);
+  return s;
+}
+
+// Advances the buffered stream position by `n` bytes without reading them.
+IOStatus PosixSequentialFile::Skip(uint64_t n) {
+  if (fseek(file_, static_cast<long int>(n), SEEK_CUR)) {
+    return IOError("While fseek to skip " + std::to_string(n) + " bytes",
+                   filename_, errno);
+  }
+  return IOStatus::OK();
+}
+
+// Hints the OS to drop its page-cache copies of [offset, offset+length).
+// No-op on non-Linux and under direct I/O (which bypasses the page cache).
+IOStatus PosixSequentialFile::InvalidateCache(size_t offset, size_t length) {
+#ifndef OS_LINUX
+  (void)offset;
+  (void)length;
+  return IOStatus::OK();
+#else
+  if (!use_direct_io()) {
+    // free OS pages
+    int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
+    if (ret != 0) {
+      return IOError("While fadvise NotNeeded offset " +
+                         std::to_string(offset) + " len " +
+                         std::to_string(length),
+                     filename_, errno);
+    }
+  }
+  return IOStatus::OK();
+#endif
+}
+
+/*
+ * PosixRandomAccessFile
+ */
+#if defined(OS_LINUX)
+// Linux: builds a unique file id from (device, inode, FS_IOC_GETVERSION
+// generation), each varint64-encoded into `id`. Returns the encoded length,
+// or 0 if `max_size` is too small or any syscall fails.
+size_t PosixHelper::GetUniqueIdFromFile(int fd, char* id, size_t max_size) {
+  if (max_size < kMaxVarint64Length * 3) {
+    return 0;
+  }
+
+  struct stat buf;
+  int result = fstat(fd, &buf);
+  if (result == -1) {
+    return 0;
+  }
+
+  long version = 0;
+  result = ioctl(fd, FS_IOC_GETVERSION, &version);
+  TEST_SYNC_POINT_CALLBACK("GetUniqueIdFromFile:FS_IOC_GETVERSION", &result);
+  if (result == -1) {
+    return 0;
+  }
+  uint64_t uversion = (uint64_t)version;
+
+  char* rid = id;
+  rid = EncodeVarint64(rid, buf.st_dev);
+  rid = EncodeVarint64(rid, buf.st_ino);
+  rid = EncodeVarint64(rid, uversion);
+  assert(rid >= id);
+  return static_cast<size_t>(rid - id);
+}
+#endif
+
+#if defined(OS_MACOSX) || defined(OS_AIX)
+// MacOS/AIX: same scheme as the Linux variant, but uses st_gen (inode
+// generation from fstat) instead of the FS_IOC_GETVERSION ioctl.
+size_t PosixHelper::GetUniqueIdFromFile(int fd, char* id, size_t max_size) {
+  if (max_size < kMaxVarint64Length * 3) {
+    return 0;
+  }
+
+  struct stat buf;
+  int result = fstat(fd, &buf);
+  if (result == -1) {
+    return 0;
+  }
+
+  char* rid = id;
+  rid = EncodeVarint64(rid, buf.st_dev);
+  rid = EncodeVarint64(rid, buf.st_ino);
+  rid = EncodeVarint64(rid, buf.st_gen);
+  assert(rid >= id);
+  return static_cast<size_t>(rid - id);
+}
+#endif
+
+#ifdef OS_LINUX
+// Strips at most one trailing '/' so equivalent paths normalize to the same
+// cache key. The root path "/" is preserved (the size() > 1 guard).
+std::string RemoveTrailingSlash(const std::string& path) {
+  std::string p = path;
+  if (p.size() > 1 && p.back() == '/') {
+    p.pop_back();
+  }
+  return p;
+}
+
+// Bumps the refcount for each directory and caches its logical block size.
+// Sizes of not-yet-cached directories are queried outside the lock; entries
+// already cached keep their existing size. Returns the first query error.
+Status LogicalBlockSizeCache::RefAndCacheLogicalBlockSize(
+    const std::vector<std::string>& directories) {
+  std::vector<std::string> dirs;
+  dirs.reserve(directories.size());
+  for (auto& d : directories) {
+    dirs.emplace_back(RemoveTrailingSlash(d));
+  }
+
+  // Under a read lock, collect the directories whose size is not cached yet.
+  std::map<std::string, size_t> dir_sizes;
+  {
+    ReadLock lock(&cache_mutex_);
+    for (const auto& dir : dirs) {
+      if (cache_.find(dir) == cache_.end()) {
+        dir_sizes.emplace(dir, 0);
+      }
+    }
+  }
+
+  // Query block sizes without holding any lock (may do filesystem I/O).
+  Status s;
+  for (auto& dir_size : dir_sizes) {
+    s = get_logical_block_size_of_directory_(dir_size.first, &dir_size.second);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  // Under the write lock, add refs and fill in the newly-queried sizes.
+  WriteLock lock(&cache_mutex_);
+  for (const auto& dir : dirs) {
+    auto& v = cache_[dir];
+    v.ref++;
+    auto dir_size = dir_sizes.find(dir);
+    if (dir_size != dir_sizes.end()) {
+      v.size = dir_size->second;
+    }
+  }
+  return s;
+}
+
+// Decrements the refcount for each directory and erases entries whose count
+// drops to zero. Unknown directories are ignored.
+void LogicalBlockSizeCache::UnrefAndTryRemoveCachedLogicalBlockSize(
+    const std::vector<std::string>& directories) {
+  std::vector<std::string> dirs;
+  dirs.reserve(directories.size());
+  for (auto& dir : directories) {
+    dirs.emplace_back(RemoveTrailingSlash(dir));
+  }
+
+  WriteLock lock(&cache_mutex_);
+  for (const auto& dir : dirs) {
+    auto it = cache_.find(dir);
+    if (it != cache_.end() && !(--(it->second.ref))) {
+      cache_.erase(it);
+    }
+  }
+}
+
+// Returns the logical block size for `fname`, preferring the cached value of
+// its parent directory; falls back to querying the open fd on a cache miss.
+size_t LogicalBlockSizeCache::GetLogicalBlockSize(const std::string& fname,
+                                                  int fd) {
+  std::string dir = fname.substr(0, fname.find_last_of("/"));
+  if (dir.empty()) {
+    dir = "/";
+  }
+  {
+    ReadLock lock(&cache_mutex_);
+    auto it = cache_.find(dir);
+    if (it != cache_.end()) {
+      return it->second.size;
+    }
+  }
+  return get_logical_block_size_of_fd_(fd);
+}
+#endif
+
+// Opens `directory` read-only and queries its logical block size via
+// GetLogicalBlockSizeOfFd(). *size is set only on success.
+Status PosixHelper::GetLogicalBlockSizeOfDirectory(const std::string& directory,
+                                                   size_t* size) {
+  int fd = open(directory.c_str(), O_DIRECTORY | O_RDONLY);
+  if (fd == -1) {
+    // open() failed, so no descriptor was created; previously close(-1) was
+    // called here, which only fails with EBADF and could clobber errno.
+    return Status::IOError("Cannot open directory " + directory);
+  }
+  *size = PosixHelper::GetLogicalBlockSizeOfFd(fd);
+  close(fd);
+  return Status::OK();
+}
+
+// Linux: resolves the block device backing `fd` through /sys/dev/block and
+// reads its queue/logical_block_size. Falls back to kDefaultPageSize on any
+// failure or on non-Linux platforms.
+size_t PosixHelper::GetLogicalBlockSizeOfFd(int fd) {
+#ifdef OS_LINUX
+  struct stat buf;
+  int result = fstat(fd, &buf);
+  if (result == -1) {
+    return kDefaultPageSize;
+  }
+  if (major(buf.st_dev) == 0) {
+    // Unnamed devices (e.g. non-device mounts), reserved as null device number.
+    // These don't have an entry in /sys/dev/block/. Return a sensible default.
+    return kDefaultPageSize;
+  }
+
+  // Reading queue/logical_block_size does not require special permissions.
+  const int kBufferSize = 100;
+  char path[kBufferSize];
+  char real_path[PATH_MAX + 1];
+  snprintf(path, kBufferSize, "/sys/dev/block/%u:%u", major(buf.st_dev),
+           minor(buf.st_dev));
+  if (realpath(path, real_path) == nullptr) {
+    return kDefaultPageSize;
+  }
+  std::string device_dir(real_path);
+  if (!device_dir.empty() && device_dir.back() == '/') {
+    device_dir.pop_back();
+  }
+  // NOTE: sda3 and nvme0n1p1 do not have a `queue/` subdir, only the parent sda
+  // and nvme0n1 have it.
+  // $ ls -al '/sys/dev/block/8:3'
+  // lrwxrwxrwx. 1 root root 0 Jun 26 01:38 /sys/dev/block/8:3 ->
+  // ../../block/sda/sda3
+  // $ ls -al '/sys/dev/block/259:4'
+  // lrwxrwxrwx 1 root root 0 Jan 31 16:04 /sys/dev/block/259:4 ->
+  // ../../devices/pci0000:17/0000:17:00.0/0000:18:00.0/nvme/nvme0/nvme0n1/nvme0n1p1
+  size_t parent_end = device_dir.rfind('/', device_dir.length() - 1);
+  if (parent_end == std::string::npos) {
+    return kDefaultPageSize;
+  }
+  size_t parent_begin = device_dir.rfind('/', parent_end - 1);
+  if (parent_begin == std::string::npos) {
+    return kDefaultPageSize;
+  }
+  // When the path points at a partition (e.g. .../sda/sda3 or an nvme
+  // namespace partition), step up one level to the whole-device directory.
+  std::string parent =
+      device_dir.substr(parent_begin + 1, parent_end - parent_begin - 1);
+  std::string child = device_dir.substr(parent_end + 1, std::string::npos);
+  if (parent != "block" &&
+      (child.compare(0, 4, "nvme") || child.find('p') != std::string::npos)) {
+    device_dir = device_dir.substr(0, parent_end);
+  }
+  std::string fname = device_dir + "/queue/logical_block_size";
+  FILE* fp;
+  size_t size = 0;
+  fp = fopen(fname.c_str(), "r");
+  if (fp != nullptr) {
+    char* line = nullptr;
+    size_t len = 0;
+    if (getline(&line, &len, fp) != -1) {
+      sscanf(line, "%zu", &size);
+    }
+    free(line);
+    fclose(fp);
+  }
+  // Only accept a non-zero power of two; anything else falls back to the
+  // default page size.
+  if (size != 0 && (size & (size - 1)) == 0) {
+    return size;
+  }
+#endif
+  (void)fd;
+  return kDefaultPageSize;
+}
+
+/*
+ * PosixRandomAccessFile
+ *
+ * pread() based random-access
+ */
+// Takes ownership of `fd` (closed by the destructor). The optional
+// thread-local io_uring pointer enables MultiRead/ReadAsync via io_uring.
+PosixRandomAccessFile::PosixRandomAccessFile(
+    const std::string& fname, int fd, size_t logical_block_size,
+    const EnvOptions& options
+#if defined(ROCKSDB_IOURING_PRESENT)
+    ,
+    ThreadLocalPtr* thread_local_io_urings
+#endif
+    )
+    : filename_(fname),
+      fd_(fd),
+      use_direct_io_(options.use_direct_reads),
+      logical_sector_size_(logical_block_size)
+#if defined(ROCKSDB_IOURING_PRESENT)
+      ,
+      thread_local_io_urings_(thread_local_io_urings)
+#endif
+{
+  // The second assert subsumes the first; mmap reads are never valid here.
+  assert(!options.use_direct_reads || !options.use_mmap_reads);
+  assert(!options.use_mmap_reads);
+}
+
+PosixRandomAccessFile::~PosixRandomAccessFile() { close(fd_); }
+
+// pread()-based random read of up to `n` bytes at `offset` into `scratch`;
+// *result refers to `scratch`. Under direct I/O all of offset/len/buffer must
+// be sector-aligned. Retries EINTR and loops over short reads.
+IOStatus PosixRandomAccessFile::Read(uint64_t offset, size_t n,
+                                     const IOOptions& /*opts*/, Slice* result,
+                                     char* scratch,
+                                     IODebugContext* /*dbg*/) const {
+  if (use_direct_io()) {
+    assert(IsSectorAligned(offset, GetRequiredBufferAlignment()));
+    assert(IsSectorAligned(n, GetRequiredBufferAlignment()));
+    assert(IsSectorAligned(scratch, GetRequiredBufferAlignment()));
+  }
+  IOStatus s;
+  ssize_t r = -1;
+  size_t left = n;
+  char* ptr = scratch;
+  while (left > 0) {
+    r = pread(fd_, ptr, left, static_cast<off_t>(offset));
+    if (r <= 0) {
+      if (r == -1 && errno == EINTR) {
+        continue;
+      }
+      break;
+    }
+    ptr += r;
+    offset += r;
+    left -= r;
+    if (use_direct_io() &&
+        r % static_cast<ssize_t>(GetRequiredBufferAlignment()) != 0) {
+      // Bytes reads don't fill sectors. Should only happen at the end
+      // of the file.
+      break;
+    }
+  }
+  if (r < 0) {
+    // An error: return a non-ok status. `offset` here has already been
+    // advanced past any successfully-read prefix.
+    s = IOError("While pread offset " + std::to_string(offset) + " len " +
+                    std::to_string(n),
+                filename_, errno);
+  }
+  *result = Slice(scratch, (r < 0) ? 0 : n - left);
+  return s;
+}
+
+// Batched reads. When io_uring is available (and a per-thread ring can be
+// created), submits the requests in batches of up to kIoUringDepth,
+// re-queueing partially-completed requests; otherwise falls back to the
+// serialized FSRandomAccessFile::MultiRead.
+IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs, size_t num_reqs,
+                                          const IOOptions& options,
+                                          IODebugContext* dbg) {
+  if (use_direct_io()) {
+    for (size_t i = 0; i < num_reqs; i++) {
+      assert(IsSectorAligned(reqs[i].offset, GetRequiredBufferAlignment()));
+      assert(IsSectorAligned(reqs[i].len, GetRequiredBufferAlignment()));
+      assert(IsSectorAligned(reqs[i].scratch, GetRequiredBufferAlignment()));
+    }
+  }
+
+#if defined(ROCKSDB_IOURING_PRESENT)
+  // Lazily create one io_uring per thread and cache it thread-locally.
+  struct io_uring* iu = nullptr;
+  if (thread_local_io_urings_) {
+    iu = static_cast<struct io_uring*>(thread_local_io_urings_->Get());
+    if (iu == nullptr) {
+      iu = CreateIOUring();
+      if (iu != nullptr) {
+        thread_local_io_urings_->Reset(iu);
+      }
+    }
+  }
+
+  // Init failed, platform doesn't support io_uring. Fall back to
+  // serialized reads
+  if (iu == nullptr) {
+    return FSRandomAccessFile::MultiRead(reqs, num_reqs, options, dbg);
+  }
+
+  IOStatus ios = IOStatus::OK();
+
+  // Pairs each FSReadRequest with its iovec and a count of bytes completed so
+  // far, so a short read can be resubmitted for the remainder.
+  struct WrappedReadRequest {
+    FSReadRequest* req;
+    struct iovec iov;
+    size_t finished_len;
+    explicit WrappedReadRequest(FSReadRequest* r) : req(r), finished_len(0) {}
+  };
+
+  autovector<WrappedReadRequest, 32> req_wraps;
+  autovector<WrappedReadRequest*, 4> incomplete_rq_list;
+  // Tracks in-flight wrappers to validate the user_data returned by each cqe.
+  std::unordered_set<WrappedReadRequest*> wrap_cache;
+
+  for (size_t i = 0; i < num_reqs; i++) {
+    req_wraps.emplace_back(&reqs[i]);
+  }
+
+  size_t reqs_off = 0;
+  while (num_reqs > reqs_off || !incomplete_rq_list.empty()) {
+    size_t this_reqs = (num_reqs - reqs_off) + incomplete_rq_list.size();
+
+    // If requests exceed depth, split it into batches
+    if (this_reqs > kIoUringDepth) this_reqs = kIoUringDepth;
+
+    // Retried (incomplete) requests take priority over fresh ones.
+    assert(incomplete_rq_list.size() <= this_reqs);
+    for (size_t i = 0; i < this_reqs; i++) {
+      WrappedReadRequest* rep_to_submit;
+      if (i < incomplete_rq_list.size()) {
+        rep_to_submit = incomplete_rq_list[i];
+      } else {
+        rep_to_submit = &req_wraps[reqs_off++];
+      }
+      assert(rep_to_submit->req->len > rep_to_submit->finished_len);
+      rep_to_submit->iov.iov_base =
+          rep_to_submit->req->scratch + rep_to_submit->finished_len;
+      rep_to_submit->iov.iov_len =
+          rep_to_submit->req->len - rep_to_submit->finished_len;
+
+      struct io_uring_sqe* sqe;
+      sqe = io_uring_get_sqe(iu);
+      io_uring_prep_readv(
+          sqe, fd_, &rep_to_submit->iov, 1,
+          rep_to_submit->req->offset + rep_to_submit->finished_len);
+      io_uring_sqe_set_data(sqe, rep_to_submit);
+      wrap_cache.emplace(rep_to_submit);
+    }
+    incomplete_rq_list.clear();
+
+    ssize_t ret =
+        io_uring_submit_and_wait(iu, static_cast<unsigned int>(this_reqs));
+    TEST_SYNC_POINT_CALLBACK(
+        "PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return1",
+        &ret);
+    TEST_SYNC_POINT_CALLBACK(
+        "PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return2",
+        iu);
+
+    if (static_cast<size_t>(ret) != this_reqs) {
+      fprintf(stderr, "ret = %ld this_reqs: %ld\n", (long)ret, (long)this_reqs);
+      // If error happens and we submitted fewer than expected, it is an
+      // exception case and we don't retry here. We should still consume
+      // what is submitted in the ring.
+      for (ssize_t i = 0; i < ret; i++) {
+        struct io_uring_cqe* cqe = nullptr;
+        io_uring_wait_cqe(iu, &cqe);
+        if (cqe != nullptr) {
+          io_uring_cqe_seen(iu, cqe);
+        }
+      }
+      return IOStatus::IOError("io_uring_submit_and_wait() requested " +
+                               std::to_string(this_reqs) + " but returned " +
+                               std::to_string(ret));
+    }
+
+    // Reap one completion per submitted request.
+    for (size_t i = 0; i < this_reqs; i++) {
+      struct io_uring_cqe* cqe = nullptr;
+      WrappedReadRequest* req_wrap;
+
+      // We could use the peek variant here, but this seems safer in terms
+      // of our initial wait not reaping all completions
+      ret = io_uring_wait_cqe(iu, &cqe);
+      TEST_SYNC_POINT_CALLBACK(
+          "PosixRandomAccessFile::MultiRead:io_uring_wait_cqe:return", &ret);
+      if (ret) {
+        ios = IOStatus::IOError("io_uring_wait_cqe() returns " +
+                                std::to_string(ret));
+
+        if (cqe != nullptr) {
+          io_uring_cqe_seen(iu, cqe);
+        }
+        continue;
+      }
+
+      req_wrap = static_cast<WrappedReadRequest*>(io_uring_cqe_get_data(cqe));
+      // Reset cqe data to catch any stray reuse of it
+      static_cast<struct io_uring_cqe*>(cqe)->user_data = 0xd5d5d5d5d5d5d5d5;
+      // Check that we got a valid unique cqe data
+      auto wrap_check = wrap_cache.find(req_wrap);
+      if (wrap_check == wrap_cache.end()) {
+        fprintf(stderr,
+                "PosixRandomAccessFile::MultiRead: "
+                "Bad cqe data from IO uring - %p\n",
+                req_wrap);
+        port::PrintStack();
+        ios = IOStatus::IOError("io_uring_cqe_get_data() returned " +
+                                std::to_string((uint64_t)req_wrap));
+        continue;
+      }
+      wrap_cache.erase(wrap_check);
+
+      FSReadRequest* req = req_wrap->req;
+      size_t bytes_read = 0;
+      bool read_again = false;
+      UpdateResult(cqe, filename_, req->len, req_wrap->iov.iov_len,
+                   false /*async_read*/, use_direct_io(),
+                   GetRequiredBufferAlignment(), req_wrap->finished_len, req,
+                   bytes_read, read_again);
+      int32_t res = cqe->res;
+      if (res >= 0) {
+        if (bytes_read == 0) {
+          if (read_again) {
+            // Fall back to a synchronous pread for the remaining bytes.
+            Slice tmp_slice;
+            req->status =
+                Read(req->offset + req_wrap->finished_len,
+                     req->len - req_wrap->finished_len, options, &tmp_slice,
+                     req->scratch + req_wrap->finished_len, dbg);
+            req->result =
+                Slice(req->scratch, req_wrap->finished_len + tmp_slice.size());
+          }
+          // else It means EOF so no need to do anything.
+        } else if (bytes_read < req_wrap->iov.iov_len) {
+          // Short read: resubmit the remainder in the next batch.
+          incomplete_rq_list.push_back(req_wrap);
+        }
+      }
+      io_uring_cqe_seen(iu, cqe);
+    }
+    wrap_cache.clear();
+  }
+  return ios;
+#else
+  return FSRandomAccessFile::MultiRead(reqs, num_reqs, options, dbg);
+#endif
+}
+
+// Asks the OS to read [offset, offset+n) into the page cache ahead of time:
+// readahead() on Linux, fcntl(F_RDADVISE) on MacOS, no-op elsewhere or under
+// direct I/O (which bypasses the page cache).
+IOStatus PosixRandomAccessFile::Prefetch(uint64_t offset, size_t n,
+                                         const IOOptions& /*opts*/,
+                                         IODebugContext* /*dbg*/) {
+  IOStatus s;
+  if (!use_direct_io()) {
+    ssize_t r = 0;
+#ifdef OS_LINUX
+    r = readahead(fd_, offset, n);
+#endif
+#ifdef OS_MACOSX
+    radvisory advice;
+    advice.ra_offset = static_cast<off_t>(offset);
+    advice.ra_count = static_cast<int>(n);
+    r = fcntl(fd_, F_RDADVISE, &advice);
+#endif
+    if (r == -1) {
+      s = IOError("While prefetching offset " + std::to_string(offset) +
+                      " len " + std::to_string(n),
+                  filename_, errno);
+    }
+  }
+  return s;
+}
+
+#if defined(OS_LINUX) || defined(OS_MACOSX) || defined(OS_AIX)
+// Delegates to the platform-specific PosixHelper::GetUniqueIdFromFile above.
+size_t PosixRandomAccessFile::GetUniqueId(char* id, size_t max_size) const {
+  return PosixHelper::GetUniqueIdFromFile(fd_, id, max_size);
+}
+#endif
+
+// Translates an access-pattern hint into the matching posix_fadvise advice
+// over the whole file. Ignored under direct I/O (no page cache involved).
+void PosixRandomAccessFile::Hint(AccessPattern pattern) {
+  if (use_direct_io()) {
+    return;
+  }
+  switch (pattern) {
+    case kNormal:
+      Fadvise(fd_, 0, 0, POSIX_FADV_NORMAL);
+      break;
+    case kRandom:
+      Fadvise(fd_, 0, 0, POSIX_FADV_RANDOM);
+      break;
+    case kSequential:
+      Fadvise(fd_, 0, 0, POSIX_FADV_SEQUENTIAL);
+      break;
+    case kWillNeed:
+      Fadvise(fd_, 0, 0, POSIX_FADV_WILLNEED);
+      break;
+    case kWontNeed:
+      Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED);
+      break;
+    default:
+      assert(false);
+      break;
+  }
+}
+
+// Drops the page-cache copies of [offset, offset+length). No-op under direct
+// I/O and on non-Linux platforms.
+IOStatus PosixRandomAccessFile::InvalidateCache(size_t offset, size_t length) {
+  if (use_direct_io()) {
+    return IOStatus::OK();
+  }
+#ifndef OS_LINUX
+  (void)offset;
+  (void)length;
+  return IOStatus::OK();
+#else
+  // free OS pages
+  int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
+  if (ret == 0) {
+    return IOStatus::OK();
+  }
+  return IOError("While fadvise NotNeeded offset " + std::to_string(offset) +
+                     " len " + std::to_string(length),
+                 filename_, errno);
+#endif
+}
+
+// Submits a single asynchronous read via io_uring. On success, *io_handle
+// receives a heap-allocated Posix_IOHandle (freed via *del_fn) that carries
+// the callback invoked on completion. Returns NotSupported when io_uring is
+// unavailable.
+IOStatus PosixRandomAccessFile::ReadAsync(
+    FSReadRequest& req, const IOOptions& /*opts*/,
+    std::function<void(const FSReadRequest&, void*)> cb, void* cb_arg,
+    void** io_handle, IOHandleDeleter* del_fn, IODebugContext* /*dbg*/) {
+  if (use_direct_io()) {
+    assert(IsSectorAligned(req.offset, GetRequiredBufferAlignment()));
+    assert(IsSectorAligned(req.len, GetRequiredBufferAlignment()));
+    assert(IsSectorAligned(req.scratch, GetRequiredBufferAlignment()));
+  }
+
+#if defined(ROCKSDB_IOURING_PRESENT)
+  // io_uring_queue_init: lazily create one ring per thread.
+  struct io_uring* iu = nullptr;
+  if (thread_local_io_urings_) {
+    iu = static_cast<struct io_uring*>(thread_local_io_urings_->Get());
+    if (iu == nullptr) {
+      iu = CreateIOUring();
+      if (iu != nullptr) {
+        thread_local_io_urings_->Reset(iu);
+      }
+    }
+  }
+
+  // Init failed, platform doesn't support io_uring.
+  if (iu == nullptr) {
+    return IOStatus::NotSupported("ReadAsync");
+  }
+
+  // Allocate io_handle.
+  IOHandleDeleter deletefn = [](void* args) -> void {
+    delete (static_cast<Posix_IOHandle*>(args));
+    args = nullptr;
+  };
+
+  // Initialize Posix_IOHandle.
+  Posix_IOHandle* posix_handle =
+      new Posix_IOHandle(iu, cb, cb_arg, req.offset, req.len, req.scratch,
+                         use_direct_io(), GetRequiredBufferAlignment());
+  posix_handle->iov.iov_base = req.scratch;
+  posix_handle->iov.iov_len = req.len;
+
+  *io_handle = static_cast<void*>(posix_handle);
+  *del_fn = deletefn;
+
+  // Step 3: io_uring_sqe_set_data
+  struct io_uring_sqe* sqe;
+  sqe = io_uring_get_sqe(iu);
+
+  io_uring_prep_readv(sqe, fd_, /*sqe->addr=*/&posix_handle->iov,
+                      /*sqe->len=*/1, /*sqe->offset=*/posix_handle->offset);
+
+  // Sets sqe->user_data to posix_handle.
+  io_uring_sqe_set_data(sqe, posix_handle);
+
+  // Step 4: io_uring_submit
+  ssize_t ret = io_uring_submit(iu);
+  if (ret < 0) {
+    fprintf(stderr, "io_uring_submit error: %ld\n", long(ret));
+    return IOStatus::IOError("io_uring_submit() requested but returned " +
+                             std::to_string(ret));
+  }
+  return IOStatus::OK();
+#else
+  (void)req;
+  (void)cb;
+  (void)cb_arg;
+  (void)io_handle;
+  (void)del_fn;
+  return IOStatus::NotSupported("ReadAsync");
+#endif
+}
+
+/*
+ * PosixMmapReadableFile
+ *
+ * mmap() based random-access
+ */
+// base[0,length-1] contains the mmapped contents of the file.
+// Takes ownership of both the mapping (munmap'd in the destructor) and `fd`
+// (retained so it can be closed and fadvise'd later).
+PosixMmapReadableFile::PosixMmapReadableFile(const int fd,
+                                             const std::string& fname,
+                                             void* base, size_t length,
+                                             const EnvOptions& options)
+    : fd_(fd), filename_(fname), mmapped_region_(base), length_(length) {
+#ifdef NDEBUG
+  (void)options;
+#endif
+  fd_ = fd_ + 0;  // suppress the warning for used variables
+  assert(options.use_mmap_reads);
+  assert(!options.use_direct_reads);
+}
+
+// Unmaps the region and closes the fd. A munmap failure is only reported by
+// printing (NOTE(review): to stdout, not stderr — looks unintentional;
+// destructors cannot return a status).
+PosixMmapReadableFile::~PosixMmapReadableFile() {
+  int ret = munmap(mmapped_region_, length_);
+  if (ret != 0) {
+    fprintf(stdout, "failed to munmap %p length %" ROCKSDB_PRIszt " \n",
+            mmapped_region_, length_);
+  }
+  close(fd_);
+}
+
+// Zero-copy read: *result points directly into the mapped region, so no data
+// is copied into `scratch`. Reads past EOF are clamped; an offset beyond the
+// file length is an error.
+IOStatus PosixMmapReadableFile::Read(uint64_t offset, size_t n,
+                                     const IOOptions& /*opts*/, Slice* result,
+                                     char* /*scratch*/,
+                                     IODebugContext* /*dbg*/) const {
+  IOStatus s;
+  if (offset > length_) {
+    *result = Slice();
+    return IOError("While mmap read offset " + std::to_string(offset) +
+                       " larger than file length " + std::to_string(length_),
+                   filename_, EINVAL);
+  } else if (offset + n > length_) {
+    n = static_cast<size_t>(length_ - offset);
+  }
+  *result = Slice(reinterpret_cast<char*>(mmapped_region_) + offset, n);
+  return s;
+}
+
+// Translates an access-pattern hint into posix_madvise advice for the whole
+// mapped region.
+void PosixMmapReadableFile::Hint(AccessPattern pattern) {
+  switch (pattern) {
+    case kNormal:
+      Madvise(mmapped_region_, length_, POSIX_MADV_NORMAL);
+      break;
+    case kRandom:
+      Madvise(mmapped_region_, length_, POSIX_MADV_RANDOM);
+      break;
+    case kSequential:
+      Madvise(mmapped_region_, length_, POSIX_MADV_SEQUENTIAL);
+      break;
+    case kWillNeed:
+      Madvise(mmapped_region_, length_, POSIX_MADV_WILLNEED);
+      break;
+    case kWontNeed:
+      Madvise(mmapped_region_, length_, POSIX_MADV_DONTNEED);
+      break;
+    default:
+      assert(false);
+      break;
+  }
+}
+
+// Drops page-cache copies of [offset, offset+length) via fadvise on the
+// retained fd. No-op on non-Linux platforms.
+IOStatus PosixMmapReadableFile::InvalidateCache(size_t offset, size_t length) {
+#ifndef OS_LINUX
+  (void)offset;
+  (void)length;
+  return IOStatus::OK();
+#else
+  // free OS pages
+  int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
+  if (ret == 0) {
+    return IOStatus::OK();
+  }
+  return IOError("While fadvise not needed. Offset " + std::to_string(offset) +
+                     " len" + std::to_string(length),
+                 filename_, errno);
+#endif
+}
+
+/*
+ * PosixMmapFile
+ *
+ * We preallocate up to an extra megabyte and use memcpy to append new
+ * data to the file. This is safe since we either properly close the
+ * file before reading from it, or for log files, the reading code
+ * knows enough to skip zero suffixes.
+ */
+// Unmaps the currently mapped window (if any), folding its length into
+// file_offset_ and growing the next mapping size (doubling, capped at 1MB).
+// Resets all window pointers on success.
+IOStatus PosixMmapFile::UnmapCurrentRegion() {
+  TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0");
+  if (base_ != nullptr) {
+    int munmap_status = munmap(base_, limit_ - base_);
+    if (munmap_status != 0) {
+      // munmap() returns -1 on failure and sets errno; report errno rather
+      // than the -1 return value so the status carries the real error code.
+      return IOError("While munmap", filename_, errno);
+    }
+    file_offset_ += limit_ - base_;
+    base_ = nullptr;
+    limit_ = nullptr;
+    last_sync_ = nullptr;
+    dst_ = nullptr;
+
+    // Increase the amount we map the next time, but capped at 1MB
+    if (map_size_ < (1 << 20)) {
+      map_size_ *= 2;
+    }
+  }
+  return IOStatus::OK();
+}
+
+// Extends the file by map_size_ bytes (fallocate, falling back to
+// posix_fallocate) and maps the new window at file_offset_. Requires
+// fallocate support; returns NotSupported otherwise.
+IOStatus PosixMmapFile::MapNewRegion() {
+#ifdef ROCKSDB_FALLOCATE_PRESENT
+  assert(base_ == nullptr);
+  // NOTE: reuses the UnmapCurrentRegion kill point label.
+  TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0");
+  // we can't fallocate with FALLOC_FL_KEEP_SIZE here
+  if (allow_fallocate_) {
+    IOSTATS_TIMER_GUARD(allocate_nanos);
+    int alloc_status = fallocate(fd_, 0, file_offset_, map_size_);
+    if (alloc_status != 0) {
+      // fallback to posix_fallocate
+      alloc_status = posix_fallocate(fd_, file_offset_, map_size_);
+    }
+    if (alloc_status != 0) {
+      return IOStatus::IOError("Error allocating space to file : " + filename_ +
+                               "Error : " + errnoStr(alloc_status).c_str());
+    }
+  }
+
+  TEST_KILL_RANDOM("PosixMmapFile::Append:1");
+  void* ptr = mmap(nullptr, map_size_, PROT_READ | PROT_WRITE, MAP_SHARED, fd_,
+                   file_offset_);
+  if (ptr == MAP_FAILED) {
+    return IOStatus::IOError("MMap failed on " + filename_);
+  }
+  TEST_KILL_RANDOM("PosixMmapFile::Append:2");
+
+  // Fresh window: nothing written or synced in it yet.
+  base_ = reinterpret_cast<char*>(ptr);
+  limit_ = base_ + map_size_;
+  dst_ = base_;
+  last_sync_ = base_;
+  return IOStatus::OK();
+#else
+  return IOStatus::NotSupported("This platform doesn't support fallocate()");
+#endif
+}
+
+// msync()s the page-aligned span covering everything appended since the last
+// sync; no-op when nothing new was written.
+IOStatus PosixMmapFile::Msync() {
+  if (dst_ == last_sync_) {
+    return IOStatus::OK();
+  }
+  // Find the beginnings of the pages that contain the first and last
+  // bytes to be synced.
+  size_t p1 = TruncateToPageBoundary(last_sync_ - base_);
+  size_t p2 = TruncateToPageBoundary(dst_ - base_ - 1);
+  last_sync_ = dst_;
+  TEST_KILL_RANDOM("PosixMmapFile::Msync:0");
+  if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) {
+    return IOError("While msync", filename_, errno);
+  }
+  return IOStatus::OK();
+}
+
+// Takes ownership of `fd`. The initial mapping window is 64KB rounded up to
+// the page size; it doubles on each remap up to 1MB (see UnmapCurrentRegion).
+PosixMmapFile::PosixMmapFile(const std::string& fname, int fd, size_t page_size,
+                             const EnvOptions& options)
+    : filename_(fname),
+      fd_(fd),
+      page_size_(page_size),
+      map_size_(Roundup(65536, page_size)),
+      base_(nullptr),
+      limit_(nullptr),
+      dst_(nullptr),
+      last_sync_(nullptr),
+      file_offset_(0) {
+#ifdef ROCKSDB_FALLOCATE_PRESENT
+  allow_fallocate_ = options.allow_fallocate;
+  fallocate_with_keep_size_ = options.fallocate_with_keep_size;
+#else
+  (void)options;
+#endif
+  // page_size must be a power of two for the page-boundary arithmetic.
+  assert((page_size & (page_size - 1)) == 0);
+  assert(options.use_mmap_writes);
+  assert(!options.use_direct_writes);
+}
+
+// Closes the file if Close() was not called explicitly; any error from the
+// implicit close is deliberately ignored (destructors cannot report it).
+PosixMmapFile::~PosixMmapFile() {
+  if (fd_ >= 0) {
+    IOStatus s = PosixMmapFile::Close(IOOptions(), nullptr);
+    s.PermitUncheckedError();
+  }
+}
+
+// Appends by memcpy'ing into the mapped window, unmapping and mapping a new
+// (larger) window whenever the current one fills up.
+IOStatus PosixMmapFile::Append(const Slice& data, const IOOptions& /*opts*/,
+                               IODebugContext* /*dbg*/) {
+  const char* src = data.data();
+  size_t left = data.size();
+  while (left > 0) {
+    assert(base_ <= dst_);
+    assert(dst_ <= limit_);
+    size_t avail = limit_ - dst_;
+    if (avail == 0) {
+      // Current window exhausted: roll over to a fresh mapping.
+      IOStatus s = UnmapCurrentRegion();
+      if (!s.ok()) {
+        return s;
+      }
+      s = MapNewRegion();
+      if (!s.ok()) {
+        return s;
+      }
+      TEST_KILL_RANDOM("PosixMmapFile::Append:0");
+    }
+
+    size_t n = (left <= avail) ? left : avail;
+    assert(dst_);
+    memcpy(dst_, src, n);
+    dst_ += n;
+    src += n;
+    left -= n;
+  }
+  return IOStatus::OK();
+}
+
+// Unmaps the active window, truncates away the unwritten preallocated tail,
+// and closes the fd. The first error encountered wins; fd_ is invalidated
+// regardless so the destructor won't re-close.
+IOStatus PosixMmapFile::Close(const IOOptions& /*opts*/,
+                              IODebugContext* /*dbg*/) {
+  IOStatus s;
+  // Bytes mapped but never written; must be computed before unmapping.
+  size_t unused = limit_ - dst_;
+
+  s = UnmapCurrentRegion();
+  if (!s.ok()) {
+    s = IOError("While closing mmapped file", filename_, errno);
+  } else if (unused > 0) {
+    // Trim the extra space at the end of the file
+    if (ftruncate(fd_, file_offset_ - unused) < 0) {
+      s = IOError("While ftruncating mmaped file", filename_, errno);
+    }
+  }
+
+  if (close(fd_) < 0) {
+    if (s.ok()) {
+      s = IOError("While closing mmapped file", filename_, errno);
+    }
+  }
+
+  fd_ = -1;
+  base_ = nullptr;
+  limit_ = nullptr;
+  return s;
+}
+
// No-op: appends write directly into the OS page cache via the mapping.
IOStatus PosixMmapFile::Flush(const IOOptions& /*opts*/,
                              IODebugContext* /*dbg*/) {
  return IOStatus::OK();
}
+
+IOStatus PosixMmapFile::Sync(const IOOptions& /*opts*/,
+ IODebugContext* /*dbg*/) {
+#ifdef HAVE_FULLFSYNC
+ if (::fcntl(fd_, F_FULLFSYNC) < 0) {
+ return IOError("while fcntl(F_FULLSYNC) mmapped file", filename_, errno);
+ }
+#else // HAVE_FULLFSYNC
+ if (fdatasync(fd_) < 0) {
+ return IOError("While fdatasync mmapped file", filename_, errno);
+ }
+#endif // HAVE_FULLFSYNC
+
+ return Msync();
+}
+
+/**
+ * Flush data as well as metadata to stable storage.
+ */
+IOStatus PosixMmapFile::Fsync(const IOOptions& /*opts*/,
+ IODebugContext* /*dbg*/) {
+#ifdef HAVE_FULLFSYNC
+ if (::fcntl(fd_, F_FULLFSYNC) < 0) {
+ return IOError("While fcntl(F_FULLSYNC) on mmaped file", filename_, errno);
+ }
+#else // HAVE_FULLFSYNC
+ if (fsync(fd_) < 0) {
+ return IOError("While fsync mmaped file", filename_, errno);
+ }
+#endif // HAVE_FULLFSYNC
+
+ return Msync();
+}
+
/**
 * Get the size of valid data in the file. This will not match the
 * size that is returned from the filesystem because we use mmap
 * to extend file by map_size every time.
 */
uint64_t PosixMmapFile::GetFileSize(const IOOptions& /*opts*/,
                                    IODebugContext* /*dbg*/) {
  // Bytes written into the current mapping, plus everything before it.
  size_t used = dst_ - base_;
  return file_offset_ + used;
}
+
// Drop cached OS pages for [offset, offset+length). Linux only; a no-op
// elsewhere.
IOStatus PosixMmapFile::InvalidateCache(size_t offset, size_t length) {
#ifndef OS_LINUX
  (void)offset;
  (void)length;
  return IOStatus::OK();
#else
  // free OS pages
  int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
  if (ret == 0) {
    return IOStatus::OK();
  }
  return IOError("While fadvise NotNeeded mmapped file", filename_, errno);
#endif
}
+
#ifdef ROCKSDB_FALLOCATE_PRESENT
// Preallocate [offset, offset+len) with fallocate(), honoring the
// allow_fallocate_ and fallocate_with_keep_size_ options.
IOStatus PosixMmapFile::Allocate(uint64_t offset, uint64_t len,
                                 const IOOptions& /*opts*/,
                                 IODebugContext* /*dbg*/) {
  assert(offset <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
  assert(len <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
  TEST_KILL_RANDOM("PosixMmapFile::Allocate:0");
  int alloc_status = 0;
  if (allow_fallocate_) {
    // FALLOC_FL_KEEP_SIZE reserves space without growing the file size.
    alloc_status =
        fallocate(fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0,
                  static_cast<off_t>(offset), static_cast<off_t>(len));
  }
  if (alloc_status == 0) {
    return IOStatus::OK();
  } else {
    return IOError("While fallocate offset " + std::to_string(offset) +
                   " len " + std::to_string(len),
                   filename_, errno);
  }
}
#endif
+
/*
 * PosixWritableFile
 *
 * Use posix write to write data to a file.
 */
PosixWritableFile::PosixWritableFile(const std::string& fname, int fd,
                                     size_t logical_block_size,
                                     const EnvOptions& options)
    : FSWritableFile(options),
      filename_(fname),
      use_direct_io_(options.use_direct_writes),
      fd_(fd),
      filesize_(0),
      logical_sector_size_(logical_block_size) {
#ifdef ROCKSDB_FALLOCATE_PRESENT
  allow_fallocate_ = options.allow_fallocate;
  fallocate_with_keep_size_ = options.fallocate_with_keep_size;
#endif
#ifdef ROCKSDB_RANGESYNC_PRESENT
  // Even when the syscall exists, the filesystem may reject it; probe once.
  sync_file_range_supported_ = IsSyncFileRangeSupported(fd_);
#endif  // ROCKSDB_RANGESYNC_PRESENT
  assert(!options.use_mmap_writes);
}
+
PosixWritableFile::~PosixWritableFile() {
  if (fd_ >= 0) {
    // Explicit qualification makes the non-virtual dispatch from a
    // destructor explicit.
    IOStatus s = PosixWritableFile::Close(IOOptions(), nullptr);
    // Best effort: nothing useful can be done with a failure here.
    s.PermitUncheckedError();
  }
}
+
// Append `data` at the current end of the file. With direct I/O, both the
// buffer address and size must be sector-aligned.
IOStatus PosixWritableFile::Append(const Slice& data, const IOOptions& /*opts*/,
                                   IODebugContext* /*dbg*/) {
  if (use_direct_io()) {
    assert(IsSectorAligned(data.size(), GetRequiredBufferAlignment()));
    assert(IsSectorAligned(data.data(), GetRequiredBufferAlignment()));
  }
  const char* src = data.data();
  size_t nbytes = data.size();

  // Delegate to the PosixWrite helper; any failure becomes an IOError.
  if (!PosixWrite(fd_, src, nbytes)) {
    return IOError("While appending to file", filename_, errno);
  }

  filesize_ += nbytes;
  return IOStatus::OK();
}
+
// Write `data` at `offset` (pwrite-style) without moving the file cursor.
// With direct I/O, offset, size and buffer address must be sector-aligned.
IOStatus PosixWritableFile::PositionedAppend(const Slice& data, uint64_t offset,
                                             const IOOptions& /*opts*/,
                                             IODebugContext* /*dbg*/) {
  if (use_direct_io()) {
    assert(IsSectorAligned(offset, GetRequiredBufferAlignment()));
    assert(IsSectorAligned(data.size(), GetRequiredBufferAlignment()));
    assert(IsSectorAligned(data.data(), GetRequiredBufferAlignment()));
  }
  assert(offset <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
  const char* src = data.data();
  size_t nbytes = data.size();
  if (!PosixPositionedWrite(fd_, src, nbytes, static_cast<off_t>(offset))) {
    return IOError("While pwrite to file at offset " + std::to_string(offset),
                   filename_, errno);
  }
  // The logical size is assumed to end at the last positioned write.
  filesize_ = offset + nbytes;
  return IOStatus::OK();
}
+
// Truncate the file to exactly `size` bytes and update the cached size on
// success.
IOStatus PosixWritableFile::Truncate(uint64_t size, const IOOptions& /*opts*/,
                                     IODebugContext* /*dbg*/) {
  IOStatus s;
  int r = ftruncate(fd_, size);
  if (r < 0) {
    s = IOError("While ftruncate file to size " + std::to_string(size),
                filename_, errno);
  } else {
    filesize_ = size;
  }
  return s;
}
+
// Release any preallocated-but-unused space past the logical size, then
// close the descriptor. Preallocation bookkeeping comes from the base
// class's GetPreallocationStatus().
IOStatus PosixWritableFile::Close(const IOOptions& /*opts*/,
                                  IODebugContext* /*dbg*/) {
  IOStatus s;

  size_t block_size;
  size_t last_allocated_block;
  GetPreallocationStatus(&block_size, &last_allocated_block);
  TEST_SYNC_POINT_CALLBACK("PosixWritableFile::Close", &last_allocated_block);
  if (last_allocated_block > 0) {
    // trim the extra space preallocated at the end of the file
    // NOTE(ljin): we probably don't want to surface failure as an IOError,
    // but it will be nice to log these errors.
    int dummy __attribute__((__unused__));
    dummy = ftruncate(fd_, filesize_);
#if defined(ROCKSDB_FALLOCATE_PRESENT) && defined(FALLOC_FL_PUNCH_HOLE)
    // in some file systems, ftruncate only trims trailing space if the
    // new file size is smaller than the current size. Calling fallocate
    // with FALLOC_FL_PUNCH_HOLE flag to explicitly release these unused
    // blocks. FALLOC_FL_PUNCH_HOLE is supported on at least the following
    // filesystems:
    //   XFS (since Linux 2.6.38)
    //   ext4 (since Linux 3.0)
    //   Btrfs (since Linux 3.7)
    //   tmpfs (since Linux 3.5)
    // We ignore error since failure of this operation does not affect
    // correctness.
    struct stat file_stats;
    int result = fstat(fd_, &file_stats);
    // After ftruncate, we check whether ftruncate has the correct behavior.
    // If not, we should hack it with FALLOC_FL_PUNCH_HOLE
    // (the comparison checks whether the allocated block count exceeds what
    // the logical size requires).
    if (result == 0 &&
        (file_stats.st_size + file_stats.st_blksize - 1) /
                file_stats.st_blksize !=
            file_stats.st_blocks / (file_stats.st_blksize / 512)) {
      IOSTATS_TIMER_GUARD(allocate_nanos);
      if (allow_fallocate_) {
        fallocate(fd_, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, filesize_,
                  block_size * last_allocated_block - filesize_);
      }
    }
#endif
  }

  if (close(fd_) < 0) {
    s = IOError("While closing file after writing", filename_, errno);
  }
  fd_ = -1;
  return s;
}
+
// write out the cached data to the OS cache: a no-op here, since write(2)
// already goes through the OS cache with no application-level buffering.
IOStatus PosixWritableFile::Flush(const IOOptions& /*opts*/,
                                  IODebugContext* /*dbg*/) {
  return IOStatus::OK();
}
+
// Flush written data (not necessarily metadata) to stable storage.
IOStatus PosixWritableFile::Sync(const IOOptions& /*opts*/,
                                 IODebugContext* /*dbg*/) {
#ifdef HAVE_FULLFSYNC
  // Where F_FULLFSYNC exists (macOS), fdatasync() does not guarantee
  // durability, so use the fcntl instead.
  if (::fcntl(fd_, F_FULLFSYNC) < 0) {
    return IOError("while fcntl(F_FULLFSYNC)", filename_, errno);
  }
#else  // HAVE_FULLFSYNC
  if (fdatasync(fd_) < 0) {
    return IOError("While fdatasync", filename_, errno);
  }
#endif  // HAVE_FULLFSYNC
  return IOStatus::OK();
}
+
// Flush data as well as metadata to stable storage.
IOStatus PosixWritableFile::Fsync(const IOOptions& /*opts*/,
                                  IODebugContext* /*dbg*/) {
#ifdef HAVE_FULLFSYNC
  if (::fcntl(fd_, F_FULLFSYNC) < 0) {
    return IOError("while fcntl(F_FULLFSYNC)", filename_, errno);
  }
#else  // HAVE_FULLFSYNC
  if (fsync(fd_) < 0) {
    return IOError("While fsync", filename_, errno);
  }
#endif  // HAVE_FULLFSYNC
  return IOStatus::OK();
}
+
// Sync()/Fsync() operate only on the fd, so concurrent calls are safe.
bool PosixWritableFile::IsSyncThreadSafe() const { return true; }

// Logical size maintained by Append/PositionedAppend/Truncate — no fstat().
uint64_t PosixWritableFile::GetFileSize(const IOOptions& /*opts*/,
                                        IODebugContext* /*dbg*/) {
  return filesize_;
}
+
// Pass a write-lifetime hint to the kernel (Linux F_SET_RW_HINT). Failures
// are ignored; write_hint_ is updated only when the fcntl succeeds.
void PosixWritableFile::SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) {
#ifdef OS_LINUX
// Suppress Valgrind "Unimplemented functionality" error.
#ifndef ROCKSDB_VALGRIND_RUN
  if (hint == write_hint_) {
    return;
  }
  // NOTE(review): F_SET_RW_HINT reads through a uint64_t*; passing &hint
  // relies on the enum's representation being compatible — confirm on all
  // supported targets.
  if (fcntl(fd_, F_SET_RW_HINT, &hint) == 0) {
    write_hint_ = hint;
  }
#else
  (void)hint;
#endif  // ROCKSDB_VALGRIND_RUN
#else
  (void)hint;
#endif  // OS_LINUX
}
+
// Drop cached OS pages for [offset, offset+length). With direct I/O there is
// nothing cached, so it is a no-op; also a no-op on non-Linux platforms.
IOStatus PosixWritableFile::InvalidateCache(size_t offset, size_t length) {
  if (use_direct_io()) {
    return IOStatus::OK();
  }
#ifndef OS_LINUX
  (void)offset;
  (void)length;
  return IOStatus::OK();
#else
  // free OS pages
  int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
  if (ret == 0) {
    return IOStatus::OK();
  }
  return IOError("While fadvise NotNeeded", filename_, errno);
#endif
}
+
#ifdef ROCKSDB_FALLOCATE_PRESENT
// Preallocate [offset, offset+len) with fallocate(), honoring the
// allow_fallocate_ and fallocate_with_keep_size_ options.
IOStatus PosixWritableFile::Allocate(uint64_t offset, uint64_t len,
                                     const IOOptions& /*opts*/,
                                     IODebugContext* /*dbg*/) {
  assert(offset <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
  assert(len <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
  TEST_KILL_RANDOM("PosixWritableFile::Allocate:0");
  IOSTATS_TIMER_GUARD(allocate_nanos);
  int alloc_status = 0;
  if (allow_fallocate_) {
    // FALLOC_FL_KEEP_SIZE reserves space without growing the file size.
    alloc_status =
        fallocate(fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0,
                  static_cast<off_t>(offset), static_cast<off_t>(len));
  }
  if (alloc_status == 0) {
    return IOStatus::OK();
  } else {
    return IOError("While fallocate offset " + std::to_string(offset) +
                   " len " + std::to_string(len),
                   filename_, errno);
  }
}
#endif
+
// Initiate writeback for [offset, offset+nbytes) with sync_file_range(2)
// where supported; otherwise fall back to the base-class implementation.
IOStatus PosixWritableFile::RangeSync(uint64_t offset, uint64_t nbytes,
                                      const IOOptions& opts,
                                      IODebugContext* dbg) {
#ifdef ROCKSDB_RANGESYNC_PRESENT
  assert(offset <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
  assert(nbytes <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
  if (sync_file_range_supported_) {
    int ret;
    if (strict_bytes_per_sync_) {
      // Specifying `SYNC_FILE_RANGE_WAIT_BEFORE` together with an offset/length
      // that spans all bytes written so far tells `sync_file_range` to wait for
      // any outstanding writeback requests to finish before issuing a new one.
      ret =
          sync_file_range(fd_, 0, static_cast<off_t>(offset + nbytes),
                          SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE);
    } else {
      ret = sync_file_range(fd_, static_cast<off_t>(offset),
                            static_cast<off_t>(nbytes), SYNC_FILE_RANGE_WRITE);
    }
    if (ret != 0) {
      return IOError("While sync_file_range returned " + std::to_string(ret),
                     filename_, errno);
    }
    return IOStatus::OK();
  }
#endif  // ROCKSDB_RANGESYNC_PRESENT
  return FSWritableFile::RangeSync(offset, nbytes, opts, dbg);
}
+
#ifdef OS_LINUX
// Delegates to PosixHelper::GetUniqueIdFromFile (Linux only).
size_t PosixWritableFile::GetUniqueId(char* id, size_t max_size) const {
  return PosixHelper::GetUniqueIdFromFile(fd_, id, max_size);
}
#endif
+
/*
 * PosixRandomRWFile
 *
 * pread/pwrite-based read-write file over an already-open descriptor.
 */

PosixRandomRWFile::PosixRandomRWFile(const std::string& fname, int fd,
                                     const EnvOptions& /*options*/)
    : filename_(fname), fd_(fd) {}
+
+PosixRandomRWFile::~PosixRandomRWFile() {
+ if (fd_ >= 0) {
+ IOStatus s = Close(IOOptions(), nullptr);
+ s.PermitUncheckedError();
+ }
+}
+
// Write `data` at `offset` via the PosixPositionedWrite helper.
IOStatus PosixRandomRWFile::Write(uint64_t offset, const Slice& data,
                                  const IOOptions& /*opts*/,
                                  IODebugContext* /*dbg*/) {
  const char* src = data.data();
  size_t nbytes = data.size();
  if (!PosixPositionedWrite(fd_, src, nbytes, static_cast<off_t>(offset))) {
    return IOError("While write random read/write file at offset " +
                   std::to_string(offset),
                   filename_, errno);
  }

  return IOStatus::OK();
}
+
// Read up to `n` bytes at `offset` into `scratch`, looping over pread(2)
// until `n` bytes are read, EOF is reached, or a non-EINTR error occurs.
// `*result` may be shorter than `n` if EOF was hit.
IOStatus PosixRandomRWFile::Read(uint64_t offset, size_t n,
                                 const IOOptions& /*opts*/, Slice* result,
                                 char* scratch, IODebugContext* /*dbg*/) const {
  size_t left = n;
  char* ptr = scratch;
  while (left > 0) {
    ssize_t done = pread(fd_, ptr, left, offset);
    if (done < 0) {
      // error while reading from file
      if (errno == EINTR) {
        // read was interrupted, try again.
        continue;
      }
      return IOError("While reading random read/write file offset " +
                     std::to_string(offset) + " len " + std::to_string(n),
                     filename_, errno);
    } else if (done == 0) {
      // Nothing more to read
      break;
    }

    // Read `done` bytes
    ptr += done;
    offset += done;
    left -= done;
  }

  *result = Slice(scratch, n - left);
  return IOStatus::OK();
}
+
// No-op: there is no application-level buffering in this class.
IOStatus PosixRandomRWFile::Flush(const IOOptions& /*opts*/,
                                  IODebugContext* /*dbg*/) {
  return IOStatus::OK();
}

// Flush written data (not necessarily metadata) to stable storage.
IOStatus PosixRandomRWFile::Sync(const IOOptions& /*opts*/,
                                 IODebugContext* /*dbg*/) {
#ifdef HAVE_FULLFSYNC
  if (::fcntl(fd_, F_FULLFSYNC) < 0) {
    return IOError("while fcntl(F_FULLFSYNC) random rw file", filename_, errno);
  }
#else  // HAVE_FULLFSYNC
  if (fdatasync(fd_) < 0) {
    return IOError("While fdatasync random read/write file", filename_, errno);
  }
#endif  // HAVE_FULLFSYNC
  return IOStatus::OK();
}
+
+IOStatus PosixRandomRWFile::Fsync(const IOOptions& /*opts*/,
+ IODebugContext* /*dbg*/) {
+#ifdef HAVE_FULLFSYNC
+ if (::fcntl(fd_, F_FULLFSYNC) < 0) {
+ return IOError("While fcntl(F_FULLSYNC) random rw file", filename_, errno);
+ }
+#else // HAVE_FULLFSYNC
+ if (fsync(fd_) < 0) {
+ return IOError("While fsync random read/write file", filename_, errno);
+ }
+#endif // HAVE_FULLFSYNC
+ return IOStatus::OK();
+}
+
// Close the descriptor. NOTE(review): on close() failure fd_ is left
// unchanged, so the destructor will attempt a second Close — confirm this
// retry behavior is intended.
IOStatus PosixRandomRWFile::Close(const IOOptions& /*opts*/,
                                  IODebugContext* /*dbg*/) {
  if (close(fd_) < 0) {
    return IOError("While close random read/write file", filename_, errno);
  }
  fd_ = -1;
  return IOStatus::OK();
}
+
// Release the mapping; munmap() failures are ignored.
PosixMemoryMappedFileBuffer::~PosixMemoryMappedFileBuffer() {
  // TODO should have error handling though not much we can do...
  munmap(this->base_, length_);
}
+
/*
 * PosixDirectory
 */
#if !defined(BTRFS_SUPER_MAGIC)
// The magic number for BTRFS is fixed, if it's not defined, define it here
#define BTRFS_SUPER_MAGIC 0x9123683E
#endif
// Wrap an open directory fd; detect btrfs via fstatfs so that
// FsyncWithDirOptions can skip unnecessary directory fsyncs there.
PosixDirectory::PosixDirectory(int fd, const std::string& directory_name)
    : fd_(fd), directory_name_(directory_name) {
  is_btrfs_ = false;
#ifdef OS_LINUX
  struct statfs buf;
  int ret = fstatfs(fd, &buf);
  is_btrfs_ = (ret == 0 && buf.f_type == static_cast<decltype(buf.f_type)>(
                               BTRFS_SUPER_MAGIC));
#endif
}
+
PosixDirectory::~PosixDirectory() {
  if (fd_ >= 0) {
    // Explicit qualification makes the non-virtual dispatch from a
    // destructor explicit.
    IOStatus s = PosixDirectory::Close(IOOptions(), nullptr);
    s.PermitUncheckedError();
  }
}

// Plain directory fsync: delegates with default options.
IOStatus PosixDirectory::Fsync(const IOOptions& opts, IODebugContext* dbg) {
  return FsyncWithDirOptions(opts, dbg, DirFsyncOptions());
}
+
// Users who want the file entries synced in Directory project must call a
// Fsync or FsyncWithDirOptions function before Close
IOStatus PosixDirectory::Close(const IOOptions& /*opts*/,
                               IODebugContext* /*dbg*/) {
  IOStatus s = IOStatus::OK();
  if (close(fd_) < 0) {
    s = IOError("While closing directory ", directory_name_, errno);
  } else {
    // Mark closed only on success, so a later call can report the failure.
    fd_ = -1;
  }
  return s;
}
+
+IOStatus PosixDirectory::FsyncWithDirOptions(
+ const IOOptions& /*opts*/, IODebugContext* /*dbg*/,
+ const DirFsyncOptions& dir_fsync_options) {
+ assert(fd_ >= 0); // Check use after close
+ IOStatus s = IOStatus::OK();
+#ifndef OS_AIX
+ if (is_btrfs_) {
+ // skip dir fsync for new file creation, which is not needed for btrfs
+ if (dir_fsync_options.reason == DirFsyncOptions::kNewFileSynced) {
+ return s;
+ }
+ // skip dir fsync for renaming file, only need to sync new file
+ if (dir_fsync_options.reason == DirFsyncOptions::kFileRenamed) {
+ std::string new_name = dir_fsync_options.renamed_new_name;
+ assert(!new_name.empty());
+ int fd;
+ do {
+ IOSTATS_TIMER_GUARD(open_nanos);
+ fd = open(new_name.c_str(), O_RDONLY);
+ } while (fd < 0 && errno == EINTR);
+ if (fd < 0) {
+ s = IOError("While open renaming file", new_name, errno);
+ } else if (fsync(fd) < 0) {
+ s = IOError("While fsync renaming file", new_name, errno);
+ }
+ if (close(fd) < 0) {
+ s = IOError("While closing file after fsync", new_name, errno);
+ }
+ return s;
+ }
+ // fallback to dir-fsync for kDefault, kDirRenamed and kFileDeleted
+ }
+
+ // skip fsync/fcntl when fd_ == -1 since this file descriptor has been closed
+ // in either the de-construction or the close function, data must have been
+ // fsync-ed before de-construction and close is called
+#ifdef HAVE_FULLFSYNC
+ // btrfs is a Linux file system, while currently F_FULLFSYNC is available on
+ // Mac OS.
+ assert(!is_btrfs_);
+ if (fd_ != -1 && ::fcntl(fd_, F_FULLFSYNC) < 0) {
+ return IOError("while fcntl(F_FULLFSYNC)", "a directory", errno);
+ }
+#else // HAVE_FULLFSYNC
+ if (fd_ != -1 && fsync(fd_) == -1) {
+ s = IOError("While fsync", "a directory", errno);
+ }
+#endif // HAVE_FULLFSYNC
+#endif // OS_AIX
+ return s;
+}
+} // namespace ROCKSDB_NAMESPACE
+#endif
diff --git a/src/rocksdb/env/io_posix.h b/src/rocksdb/env/io_posix.h
new file mode 100644
index 000000000..f129668ea
--- /dev/null
+++ b/src/rocksdb/env/io_posix.h
@@ -0,0 +1,523 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+#include <errno.h>
+#if defined(ROCKSDB_IOURING_PRESENT)
+#include <liburing.h>
+#include <sys/uio.h>
+#endif
+#include <unistd.h>
+
+#include <atomic>
+#include <functional>
+#include <map>
+#include <string>
+
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/io_status.h"
+#include "test_util/sync_point.h"
+#include "util/mutexlock.h"
+#include "util/thread_local.h"
+
+// For non linux platform, the following macros are used only as place
+// holder.
+#if !(defined OS_LINUX) && !(defined CYGWIN) && !(defined OS_AIX)
+#define POSIX_FADV_NORMAL 0 /* [MC1] no further special treatment */
+#define POSIX_FADV_RANDOM 1 /* [MC1] expect random page refs */
+#define POSIX_FADV_SEQUENTIAL 2 /* [MC1] expect sequential page refs */
+#define POSIX_FADV_WILLNEED 3 /* [MC1] will need these pages */
+#define POSIX_FADV_DONTNEED 4 /* [MC1] don't need these pages */
+
+#define POSIX_MADV_NORMAL 0 /* [MC1] no further special treatment */
+#define POSIX_MADV_RANDOM 1 /* [MC1] expect random page refs */
+#define POSIX_MADV_SEQUENTIAL 2 /* [MC1] expect sequential page refs */
+#define POSIX_MADV_WILLNEED 3 /* [MC1] will need these pages */
+#define POSIX_MADV_DONTNEED 4 /* [MC1] don't need these pages */
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+std::string IOErrorMsg(const std::string& context,
+ const std::string& file_name);
// file_name can be left empty if it is not known.
+IOStatus IOError(const std::string& context, const std::string& file_name,
+ int err_number);
+
// Static helpers shared by the POSIX file classes: per-file unique IDs and
// logical block-size discovery for fds and directories.
class PosixHelper {
 public:
  static size_t GetUniqueIdFromFile(int fd, char* id, size_t max_size);
  static size_t GetLogicalBlockSizeOfFd(int fd);
  static Status GetLogicalBlockSizeOfDirectory(const std::string& directory,
                                               size_t* size);
};
+
/*
 * DirectIOHelper
 */
// Returns true iff `off` is a multiple of `sector_size`.
// `sector_size` must be a power of two.
inline bool IsSectorAligned(const size_t off, size_t sector_size) {
  assert((sector_size & (sector_size - 1)) == 0);
  const size_t mask = sector_size - 1;
  return (off & mask) == 0;
}
+
#ifndef NDEBUG
// Debug-only overload: true iff the buffer address is aligned to
// `sector_size` (used by direct-I/O assertions).
inline bool IsSectorAligned(const void* ptr, size_t sector_size) {
  const auto addr = reinterpret_cast<uintptr_t>(ptr);
  return addr % sector_size == 0;
}
#endif
+
#if defined(ROCKSDB_IOURING_PRESENT)
// Per-request state for an asynchronous io_uring read. Returned via the
// io_handle/del_fn out-params of ReadAsync; `cb` is invoked with `cb_arg`
// when the request completes.
struct Posix_IOHandle {
  Posix_IOHandle(struct io_uring* _iu,
                 std::function<void(const FSReadRequest&, void*)> _cb,
                 void* _cb_arg, uint64_t _offset, size_t _len, char* _scratch,
                 bool _use_direct_io, size_t _alignment)
      : iu(_iu),
        cb(_cb),
        cb_arg(_cb_arg),
        offset(_offset),
        len(_len),
        scratch(_scratch),
        use_direct_io(_use_direct_io),
        alignment(_alignment),
        is_finished(false),
        req_count(0) {}

  // iovec describing the destination buffer for the submitted read.
  struct iovec iov;
  struct io_uring* iu;
  std::function<void(const FSReadRequest&, void*)> cb;
  void* cb_arg;
  uint64_t offset;
  size_t len;
  char* scratch;
  bool use_direct_io;
  size_t alignment;
  bool is_finished;
  // req_count is used by AbortIO API to keep track of number of requests.
  uint32_t req_count;
};
+
// Translate an io_uring completion (`cqe`) into the corresponding
// FSReadRequest result/status. Sets `read_again` when a synchronous caller
// should retry via pread (a zero-byte completion that was not a clean EOF).
inline void UpdateResult(struct io_uring_cqe* cqe, const std::string& file_name,
                         size_t len, size_t iov_len, bool async_read,
                         bool use_direct_io, size_t alignment,
                         size_t& finished_len, FSReadRequest* req,
                         size_t& bytes_read, bool& read_again) {
  read_again = false;
  if (cqe->res < 0) {
    req->result = Slice(req->scratch, 0);
    req->status = IOError("Req failed", file_name, cqe->res);
  } else {
    bytes_read = static_cast<size_t>(cqe->res);
    TEST_SYNC_POINT_CALLBACK("UpdateResults::io_uring_result", &bytes_read);
    if (bytes_read == iov_len) {
      // Full read: everything requested arrived.
      req->result = Slice(req->scratch, req->len);
      req->status = IOStatus::OK();
    } else if (bytes_read == 0) {
      // cqe->res == 0 can mean EOF, or can mean partial results. See
      // comment
      // https://github.com/facebook/rocksdb/pull/6441#issuecomment-589843435
      // Fall back to pread in this case.
      if (use_direct_io && !IsSectorAligned(finished_len, alignment)) {
        // Bytes reads don't fill sectors. Should only happen at the end
        // of the file.
        req->result = Slice(req->scratch, finished_len);
        req->status = IOStatus::OK();
      } else {
        if (async_read) {
          // No bytes read. It can mean EOF. In case of partial results, it's
          // the caller's responsibility to call read/readasync again.
          req->result = Slice(req->scratch, 0);
          req->status = IOStatus::OK();
        } else {
          read_again = true;
        }
      }
    } else if (bytes_read < iov_len) {
      // Short read: accumulate for a synchronous caller; report as-is for an
      // asynchronous one.
      assert(bytes_read > 0);
      if (async_read) {
        req->result = Slice(req->scratch, bytes_read);
        req->status = IOStatus::OK();
      } else {
        assert(bytes_read + finished_len < len);
        finished_len += bytes_read;
      }
    } else {
      req->result = Slice(req->scratch, 0);
      req->status = IOError("Req returned more bytes than requested", file_name,
                            cqe->res);
    }
  }
#ifdef NDEBUG
  (void)len;
#endif
}
#endif
+
#ifdef OS_LINUX
// Files under a specific directory have the same logical block size.
// This class caches the logical block size for the specified directories to
// save the CPU cost of computing the size.
// Safe for concurrent access from multiple threads without any external
// synchronization.
class LogicalBlockSizeCache {
 public:
  LogicalBlockSizeCache(
      std::function<size_t(int)> get_logical_block_size_of_fd =
          PosixHelper::GetLogicalBlockSizeOfFd,
      std::function<Status(const std::string&, size_t*)>
          get_logical_block_size_of_directory =
              PosixHelper::GetLogicalBlockSizeOfDirectory)
      : get_logical_block_size_of_fd_(get_logical_block_size_of_fd),
        get_logical_block_size_of_directory_(
            get_logical_block_size_of_directory) {}

  // Takes the following actions:
  // 1. Increases reference count of the directories;
  // 2. If the directory's logical block size is not cached,
  //    compute the buffer size and cache the result.
  Status RefAndCacheLogicalBlockSize(
      const std::vector<std::string>& directories);

  // Takes the following actions:
  // 1. Decreases reference count of the directories;
  // 2. If the reference count of a directory reaches 0, remove the directory
  //    from the cache.
  void UnrefAndTryRemoveCachedLogicalBlockSize(
      const std::vector<std::string>& directories);

  // Returns the logical block size for the file.
  //
  // If the file is under a cached directory, return the cached size.
  // Otherwise, the size is computed.
  size_t GetLogicalBlockSize(const std::string& fname, int fd);

  // Current reference count for `dir`, or 0 if it is not cached.
  int GetRefCount(const std::string& dir) {
    ReadLock lock(&cache_mutex_);
    auto it = cache_.find(dir);
    if (it == cache_.end()) {
      return 0;
    }
    return it->second.ref;
  }

  // Number of cached directories.
  // NOTE(review): reads cache_ without taking cache_mutex_, unlike
  // GetRefCount()/Contains() — presumably intended for tests only; confirm.
  size_t Size() const { return cache_.size(); }

  // Whether `dir` currently has a cached block size.
  bool Contains(const std::string& dir) {
    ReadLock lock(&cache_mutex_);
    return cache_.find(dir) != cache_.end();
  }

 private:
  struct CacheValue {
    CacheValue() : size(0), ref(0) {}

    // Logical block size of the directory.
    size_t size;
    // Reference count of the directory.
    int ref;
  };

  std::function<size_t(int)> get_logical_block_size_of_fd_;
  std::function<Status(const std::string&, size_t*)>
      get_logical_block_size_of_directory_;

  // Maps directory path -> (block size, refcount); guarded by cache_mutex_.
  std::map<std::string, CacheValue> cache_;
  port::RWMutex cache_mutex_;
};
#endif
+
// Sequential read-only file holding both a FILE* stream and its underlying
// descriptor; supports optional direct I/O (see use_direct_io()).
class PosixSequentialFile : public FSSequentialFile {
 private:
  std::string filename_;
  FILE* file_;
  int fd_;
  bool use_direct_io_;
  size_t logical_sector_size_;

 public:
  PosixSequentialFile(const std::string& fname, FILE* file, int fd,
                      size_t logical_block_size, const EnvOptions& options);
  virtual ~PosixSequentialFile();

  virtual IOStatus Read(size_t n, const IOOptions& opts, Slice* result,
                        char* scratch, IODebugContext* dbg) override;
  virtual IOStatus PositionedRead(uint64_t offset, size_t n,
                                  const IOOptions& opts, Slice* result,
                                  char* scratch, IODebugContext* dbg) override;
  virtual IOStatus Skip(uint64_t n) override;
  virtual IOStatus InvalidateCache(size_t offset, size_t length) override;
  virtual bool use_direct_io() const override { return use_direct_io_; }
  // Required alignment for direct-I/O buffers: the logical sector size.
  virtual size_t GetRequiredBufferAlignment() const override {
    return logical_sector_size_;
  }
};
+
#if defined(ROCKSDB_IOURING_PRESENT)
// io_uring instance queue depth
const unsigned int kIoUringDepth = 256;

// Deleter matching CreateIOUring(); presumably registered with the
// thread-local io_uring storage — see its users.
inline void DeleteIOUring(void* p) {
  struct io_uring* iu = static_cast<struct io_uring*>(p);
  delete iu;
}

// Allocate and initialize an io_uring with kIoUringDepth entries.
// Returns nullptr (after cleanup) if initialization fails.
inline struct io_uring* CreateIOUring() {
  struct io_uring* new_io_uring = new struct io_uring;
  int ret = io_uring_queue_init(kIoUringDepth, new_io_uring, 0);
  if (ret) {
    delete new_io_uring;
    new_io_uring = nullptr;
  }
  return new_io_uring;
}
#endif  // defined(ROCKSDB_IOURING_PRESENT)
+
// pread-based random-access read-only file. Optionally uses direct I/O, and
// (when liburing is present) per-thread io_uring instances for
// MultiRead/ReadAsync.
class PosixRandomAccessFile : public FSRandomAccessFile {
 protected:
  std::string filename_;
  int fd_;
  bool use_direct_io_;
  size_t logical_sector_size_;
#if defined(ROCKSDB_IOURING_PRESENT)
  // Not owned: shared storage of per-thread io_uring instances.
  ThreadLocalPtr* thread_local_io_urings_;
#endif

 public:
  PosixRandomAccessFile(const std::string& fname, int fd,
                        size_t logical_block_size, const EnvOptions& options
#if defined(ROCKSDB_IOURING_PRESENT)
                        ,
                        ThreadLocalPtr* thread_local_io_urings
#endif
  );
  virtual ~PosixRandomAccessFile();

  virtual IOStatus Read(uint64_t offset, size_t n, const IOOptions& opts,
                        Slice* result, char* scratch,
                        IODebugContext* dbg) const override;

  virtual IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs,
                             const IOOptions& options,
                             IODebugContext* dbg) override;

  virtual IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& opts,
                            IODebugContext* dbg) override;

#if defined(OS_LINUX) || defined(OS_MACOSX) || defined(OS_AIX)
  virtual size_t GetUniqueId(char* id, size_t max_size) const override;
#endif
  virtual void Hint(AccessPattern pattern) override;
  virtual IOStatus InvalidateCache(size_t offset, size_t length) override;
  virtual bool use_direct_io() const override { return use_direct_io_; }
  // Required alignment for direct-I/O buffers: the logical sector size.
  virtual size_t GetRequiredBufferAlignment() const override {
    return logical_sector_size_;
  }
  // EXPERIMENTAL
  virtual IOStatus ReadAsync(
      FSReadRequest& req, const IOOptions& opts,
      std::function<void(const FSReadRequest&, void*)> cb, void* cb_arg,
      void** io_handle, IOHandleDeleter* del_fn, IODebugContext* dbg) override;
};
+
// Writable file using write(2)/pwrite(2). Tracks the logical size in
// filesize_ and supports optional direct I/O, fallocate preallocation, and
// sync_file_range-based RangeSync where available.
class PosixWritableFile : public FSWritableFile {
 protected:
  const std::string filename_;
  const bool use_direct_io_;
  int fd_;
  uint64_t filesize_;
  size_t logical_sector_size_;
#ifdef ROCKSDB_FALLOCATE_PRESENT
  bool allow_fallocate_;
  bool fallocate_with_keep_size_;
#endif
#ifdef ROCKSDB_RANGESYNC_PRESENT
  // Even if the syscall is present, the filesystem may still not properly
  // support it, so we need to do a dynamic check too.
  bool sync_file_range_supported_;
#endif  // ROCKSDB_RANGESYNC_PRESENT

 public:
  explicit PosixWritableFile(const std::string& fname, int fd,
                             size_t logical_block_size,
                             const EnvOptions& options);
  virtual ~PosixWritableFile();

  // Need to implement this so the file is truncated correctly
  // with direct I/O
  virtual IOStatus Truncate(uint64_t size, const IOOptions& opts,
                            IODebugContext* dbg) override;
  virtual IOStatus Close(const IOOptions& opts, IODebugContext* dbg) override;
  virtual IOStatus Append(const Slice& data, const IOOptions& opts,
                          IODebugContext* dbg) override;
  // Checksum-carrying overload: verification info is unused on POSIX.
  virtual IOStatus Append(const Slice& data, const IOOptions& opts,
                          const DataVerificationInfo& /* verification_info */,
                          IODebugContext* dbg) override {
    return Append(data, opts, dbg);
  }
  virtual IOStatus PositionedAppend(const Slice& data, uint64_t offset,
                                    const IOOptions& opts,
                                    IODebugContext* dbg) override;
  // Checksum-carrying overload: verification info is unused on POSIX.
  virtual IOStatus PositionedAppend(
      const Slice& data, uint64_t offset, const IOOptions& opts,
      const DataVerificationInfo& /* verification_info */,
      IODebugContext* dbg) override {
    return PositionedAppend(data, offset, opts, dbg);
  }
  virtual IOStatus Flush(const IOOptions& opts, IODebugContext* dbg) override;
  virtual IOStatus Sync(const IOOptions& opts, IODebugContext* dbg) override;
  virtual IOStatus Fsync(const IOOptions& opts, IODebugContext* dbg) override;
  virtual bool IsSyncThreadSafe() const override;
  virtual bool use_direct_io() const override { return use_direct_io_; }
  virtual void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override;
  virtual uint64_t GetFileSize(const IOOptions& opts,
                               IODebugContext* dbg) override;
  virtual IOStatus InvalidateCache(size_t offset, size_t length) override;
  // Required alignment for direct-I/O buffers: the logical sector size.
  virtual size_t GetRequiredBufferAlignment() const override {
    return logical_sector_size_;
  }
#ifdef ROCKSDB_FALLOCATE_PRESENT
  virtual IOStatus Allocate(uint64_t offset, uint64_t len,
                            const IOOptions& opts,
                            IODebugContext* dbg) override;
#endif
  virtual IOStatus RangeSync(uint64_t offset, uint64_t nbytes,
                             const IOOptions& opts,
                             IODebugContext* dbg) override;
#ifdef OS_LINUX
  virtual size_t GetUniqueId(char* id, size_t max_size) const override;
#endif
};
+
// mmap() based random-access: reads are served out of the mapped region
// [mmapped_region_, mmapped_region_ + length_).
class PosixMmapReadableFile : public FSRandomAccessFile {
 private:
  int fd_;
  std::string filename_;
  void* mmapped_region_;
  size_t length_;

 public:
  PosixMmapReadableFile(const int fd, const std::string& fname, void* base,
                        size_t length, const EnvOptions& options);
  virtual ~PosixMmapReadableFile();
  IOStatus Read(uint64_t offset, size_t n, const IOOptions& opts, Slice* result,
                char* scratch, IODebugContext* dbg) const override;
  void Hint(AccessPattern pattern) override;
  IOStatus InvalidateCache(size_t offset, size_t length) override;
};
+
// Writable file whose appends go through an mmap'ed window of the file;
// see the implementation for the mapping strategy.
class PosixMmapFile : public FSWritableFile {
 private:
  std::string filename_;
  int fd_;
  size_t page_size_;
  size_t map_size_;       // How much extra memory to map at a time
  char* base_;            // The mapped region
  char* limit_;           // Limit of the mapped region
  char* dst_;             // Where to write next (in range [base_,limit_])
  char* last_sync_;       // Where have we synced up to
  uint64_t file_offset_;  // Offset of base_ in file
#ifdef ROCKSDB_FALLOCATE_PRESENT
  bool allow_fallocate_;  // If false, fallocate calls are bypassed
  bool fallocate_with_keep_size_;
#endif

  // Roundup x to a multiple of y
  static size_t Roundup(size_t x, size_t y) { return ((x + y - 1) / y) * y; }

  // Round s down to the start of its page (page_size_ is a power of two).
  size_t TruncateToPageBoundary(size_t s) {
    s -= (s & (page_size_ - 1));
    assert((s % page_size_) == 0);
    return s;
  }

  IOStatus MapNewRegion();
  IOStatus UnmapCurrentRegion();
  IOStatus Msync();

 public:
  PosixMmapFile(const std::string& fname, int fd, size_t page_size,
                const EnvOptions& options);
  ~PosixMmapFile();

  // Means Close() will properly take care of truncate
  // and it does not need any additional information
  virtual IOStatus Truncate(uint64_t /*size*/, const IOOptions& /*opts*/,
                            IODebugContext* /*dbg*/) override {
    return IOStatus::OK();
  }
  virtual IOStatus Close(const IOOptions& opts, IODebugContext* dbg) override;
  virtual IOStatus Append(const Slice& data, const IOOptions& opts,
                          IODebugContext* dbg) override;
  // Checksum-carrying overload: verification info is unused on POSIX.
  virtual IOStatus Append(const Slice& data, const IOOptions& opts,
                          const DataVerificationInfo& /* verification_info */,
                          IODebugContext* dbg) override {
    return Append(data, opts, dbg);
  }
  virtual IOStatus Flush(const IOOptions& opts, IODebugContext* dbg) override;
  virtual IOStatus Sync(const IOOptions& opts, IODebugContext* dbg) override;
  virtual IOStatus Fsync(const IOOptions& opts, IODebugContext* dbg) override;
  virtual uint64_t GetFileSize(const IOOptions& opts,
                               IODebugContext* dbg) override;
  virtual IOStatus InvalidateCache(size_t offset, size_t length) override;
#ifdef ROCKSDB_FALLOCATE_PRESENT
  virtual IOStatus Allocate(uint64_t offset, uint64_t len,
                            const IOOptions& opts,
                            IODebugContext* dbg) override;
#endif
};
+
// Read-write file implemented with pread(2)/pwrite(2); no application-level
// buffering.
class PosixRandomRWFile : public FSRandomRWFile {
 public:
  explicit PosixRandomRWFile(const std::string& fname, int fd,
                             const EnvOptions& options);
  virtual ~PosixRandomRWFile();

  virtual IOStatus Write(uint64_t offset, const Slice& data,
                         const IOOptions& opts, IODebugContext* dbg) override;

  virtual IOStatus Read(uint64_t offset, size_t n, const IOOptions& opts,
                        Slice* result, char* scratch,
                        IODebugContext* dbg) const override;

  virtual IOStatus Flush(const IOOptions& opts, IODebugContext* dbg) override;
  virtual IOStatus Sync(const IOOptions& opts, IODebugContext* dbg) override;
  virtual IOStatus Fsync(const IOOptions& opts, IODebugContext* dbg) override;
  virtual IOStatus Close(const IOOptions& opts, IODebugContext* dbg) override;

 private:
  const std::string filename_;
  int fd_;
};
+
// MemoryMappedFileBuffer whose destructor munmaps the owned region.
struct PosixMemoryMappedFileBuffer : public MemoryMappedFileBuffer {
  PosixMemoryMappedFileBuffer(void* _base, size_t _length)
      : MemoryMappedFileBuffer(_base, _length) {}
  virtual ~PosixMemoryMappedFileBuffer();
};
+
// FSDirectory over an open directory descriptor. is_btrfs_ enables the
// fsync optimizations in FsyncWithDirOptions.
class PosixDirectory : public FSDirectory {
 public:
  explicit PosixDirectory(int fd, const std::string& directory_name);
  ~PosixDirectory();
  virtual IOStatus Fsync(const IOOptions& opts, IODebugContext* dbg) override;

  virtual IOStatus Close(const IOOptions& opts, IODebugContext* dbg) override;

  virtual IOStatus FsyncWithDirOptions(
      const IOOptions&, IODebugContext*,
      const DirFsyncOptions& dir_fsync_options) override;

 private:
  int fd_;
  bool is_btrfs_;
  const std::string directory_name_;
};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/env/io_posix_test.cc b/src/rocksdb/env/io_posix_test.cc
new file mode 100644
index 000000000..81ce50587
--- /dev/null
+++ b/src/rocksdb/env/io_posix_test.cc
@@ -0,0 +1,141 @@
+// Copyright (c) 2020-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "test_util/testharness.h"
+
+#ifdef ROCKSDB_LIB_IO_POSIX
+#include "env/io_posix.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifdef OS_LINUX
+class LogicalBlockSizeCacheTest : public testing::Test {};
+
+// Tests the caching behavior.
+// Scheme: get_fd_block_size returns the fd value itself and
+// get_dir_block_size returns the value mapped in dir_fds, so each expected
+// "block size" identifies which lookup produced it. `ncall` counts every
+// underlying (non-cached) lookup.
+TEST_F(LogicalBlockSizeCacheTest, Cache) {
+  int ncall = 0;
+  auto get_fd_block_size = [&](int fd) {
+    ncall++;
+    return fd;
+  };
+  std::map<std::string, int> dir_fds{
+      {"/", 0},
+      {"/db", 1},
+      {"/db1", 2},
+      {"/db2", 3},
+  };
+  auto get_dir_block_size = [&](const std::string& dir, size_t* size) {
+    ncall++;
+    *size = dir_fds[dir];
+    return Status::OK();
+  };
+  LogicalBlockSizeCache cache(get_fd_block_size, get_dir_block_size);
+  ASSERT_EQ(0, ncall);
+  ASSERT_EQ(0, cache.Size());
+
+  // Nothing cached yet: every lookup hits get_fd_block_size.
+  ASSERT_EQ(6, cache.GetLogicalBlockSize("/sst", 6));
+  ASSERT_EQ(1, ncall);
+  ASSERT_EQ(7, cache.GetLogicalBlockSize("/db/sst1", 7));
+  ASSERT_EQ(2, ncall);
+  ASSERT_EQ(8, cache.GetLogicalBlockSize("/db/sst2", 8));
+  ASSERT_EQ(3, ncall);
+
+  // Note: "/db1/" with a trailing slash is cached as "/db1" (checked by
+  // Contains below), so paths are evidently normalized on insertion.
+  ASSERT_OK(cache.RefAndCacheLogicalBlockSize({"/", "/db1/", "/db2"}));
+  ASSERT_EQ(3, cache.Size());
+  ASSERT_TRUE(cache.Contains("/"));
+  ASSERT_TRUE(cache.Contains("/db1"));
+  ASSERT_TRUE(cache.Contains("/db2"));
+  ASSERT_EQ(6, ncall);
+  // Block size for / is cached.
+  ASSERT_EQ(0, cache.GetLogicalBlockSize("/sst", 6));
+  ASSERT_EQ(6, ncall);
+  // No cached size for /db.
+  ASSERT_EQ(7, cache.GetLogicalBlockSize("/db/sst1", 7));
+  ASSERT_EQ(7, ncall);
+  ASSERT_EQ(8, cache.GetLogicalBlockSize("/db/sst2", 8));
+  ASSERT_EQ(8, ncall);
+  // Block size for /db1 is cached.
+  ASSERT_EQ(2, cache.GetLogicalBlockSize("/db1/sst1", 4));
+  ASSERT_EQ(8, ncall);
+  ASSERT_EQ(2, cache.GetLogicalBlockSize("/db1/sst2", 5));
+  ASSERT_EQ(8, ncall);
+  // Block size for /db2 is cached.
+  ASSERT_EQ(3, cache.GetLogicalBlockSize("/db2/sst1", 6));
+  ASSERT_EQ(8, ncall);
+  ASSERT_EQ(3, cache.GetLogicalBlockSize("/db2/sst2", 7));
+  ASSERT_EQ(8, ncall);
+
+  ASSERT_OK(cache.RefAndCacheLogicalBlockSize({"/db"}));
+  ASSERT_EQ(4, cache.Size());
+  ASSERT_TRUE(cache.Contains("/"));
+  ASSERT_TRUE(cache.Contains("/db1"));
+  ASSERT_TRUE(cache.Contains("/db2"));
+  ASSERT_TRUE(cache.Contains("/db"));
+
+  ASSERT_EQ(9, ncall);
+  // Block size for /db is cached.
+  ASSERT_EQ(1, cache.GetLogicalBlockSize("/db/sst1", 7));
+  ASSERT_EQ(9, ncall);
+  ASSERT_EQ(1, cache.GetLogicalBlockSize("/db/sst2", 8));
+  ASSERT_EQ(9, ncall);
+}
+
+// Tests the reference counting behavior.
+TEST_F(LogicalBlockSizeCacheTest, Ref) {
+ int ncall = 0;
+ auto get_fd_block_size = [&](int fd) {
+ ncall++;
+ return fd;
+ };
+ std::map<std::string, int> dir_fds{
+ {"/db", 0},
+ };
+ auto get_dir_block_size = [&](const std::string& dir, size_t* size) {
+ ncall++;
+ *size = dir_fds[dir];
+ return Status::OK();
+ };
+ LogicalBlockSizeCache cache(get_fd_block_size, get_dir_block_size);
+
+ ASSERT_EQ(0, ncall);
+
+ ASSERT_EQ(1, cache.GetLogicalBlockSize("/db/sst0", 1));
+ ASSERT_EQ(1, ncall);
+
+ ASSERT_OK(cache.RefAndCacheLogicalBlockSize({"/db"}));
+ ASSERT_EQ(2, ncall);
+ ASSERT_EQ(1, cache.GetRefCount("/db"));
+ // Block size for /db is cached. Ref count = 1.
+ ASSERT_EQ(0, cache.GetLogicalBlockSize("/db/sst1", 1));
+ ASSERT_EQ(2, ncall);
+
+ // Ref count = 2, but won't recompute the cached buffer size.
+ ASSERT_OK(cache.RefAndCacheLogicalBlockSize({"/db"}));
+ ASSERT_EQ(2, cache.GetRefCount("/db"));
+ ASSERT_EQ(2, ncall);
+
+ // Ref count = 1.
+ cache.UnrefAndTryRemoveCachedLogicalBlockSize({"/db"});
+ ASSERT_EQ(1, cache.GetRefCount("/db"));
+ // Block size for /db is still cached.
+ ASSERT_EQ(0, cache.GetLogicalBlockSize("/db/sst2", 1));
+ ASSERT_EQ(2, ncall);
+
+ // Ref count = 0 and cached buffer size for /db is removed.
+ cache.UnrefAndTryRemoveCachedLogicalBlockSize({"/db"});
+ ASSERT_EQ(0, cache.Size());
+ ASSERT_EQ(1, cache.GetLogicalBlockSize("/db/sst0", 1));
+ ASSERT_EQ(3, ncall);
+}
+#endif
+
+} // namespace ROCKSDB_NAMESPACE
+#endif
+
+// Standard RocksDB test entry point: install the stack-trace handler so
+// crashes in tests produce backtraces, then run all registered gtests.
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/env/mock_env.cc b/src/rocksdb/env/mock_env.cc
new file mode 100644
index 000000000..bfa7dc2f4
--- /dev/null
+++ b/src/rocksdb/env/mock_env.cc
@@ -0,0 +1,1070 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "env/mock_env.h"
+
+#include <algorithm>
+#include <chrono>
+
+#include "env/emulated_clock.h"
+#include "file/filename.h"
+#include "port/sys_time.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/utilities/options_type.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/hash.h"
+#include "util/random.h"
+#include "util/rate_limiter.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+// Best-effort current time from `clock`; on failure the arbitrary fallback
+// below is returned (the error status is deliberately ignored).
+int64_t MaybeCurrentTime(const std::shared_ptr<SystemClock>& clock) {
+  int64_t time = 1337346000;  // arbitrary fallback default
+  clock->GetCurrentTime(&time).PermitUncheckedError();
+  return time;
+}
+
+// Configurable-options entry exposing EmulatedSystemClock's
+// "time_elapse_only_sleep" flag. Offset 0 + kCompareNever: the setter/getter
+// lambdas operate on the clock object itself rather than a struct field.
+static std::unordered_map<std::string, OptionTypeInfo> time_elapse_type_info = {
+#ifndef ROCKSDB_LITE
+    {"time_elapse_only_sleep",
+     {0, OptionType::kBoolean, OptionVerificationType::kNormal,
+      OptionTypeFlags::kCompareNever,
+      [](const ConfigOptions& /*opts*/, const std::string& /*name*/,
+         const std::string& value, void* addr) {
+        auto clock = static_cast<EmulatedSystemClock*>(addr);
+        clock->SetTimeElapseOnlySleep(ParseBoolean("", value));
+        return Status::OK();
+      },
+      [](const ConfigOptions& /*opts*/, const std::string& /*name*/,
+         const void* addr, std::string* value) {
+        const auto clock = static_cast<const EmulatedSystemClock*>(addr);
+        *value = clock->IsTimeElapseOnlySleep() ? "true" : "false";
+        return Status::OK();
+      },
+      nullptr}},
+#endif  // ROCKSDB_LITE
+};
+// Configurable-options entry exposing EmulatedSystemClock's "mock_sleep"
+// flag, wired to SetMockSleep / IsMockSleepEnabled on the clock object.
+static std::unordered_map<std::string, OptionTypeInfo> mock_sleep_type_info = {
+#ifndef ROCKSDB_LITE
+    {"mock_sleep",
+     {0, OptionType::kBoolean, OptionVerificationType::kNormal,
+      OptionTypeFlags::kCompareNever,
+      [](const ConfigOptions& /*opts*/, const std::string& /*name*/,
+         const std::string& value, void* addr) {
+        auto clock = static_cast<EmulatedSystemClock*>(addr);
+        clock->SetMockSleep(ParseBoolean("", value));
+        return Status::OK();
+      },
+      [](const ConfigOptions& /*opts*/, const std::string& /*name*/,
+         const void* addr, std::string* value) {
+        const auto clock = static_cast<const EmulatedSystemClock*>(addr);
+        *value = clock->IsMockSleepEnabled() ? "true" : "false";
+        return Status::OK();
+      },
+      nullptr}},
+#endif  // ROCKSDB_LITE
+};
+} // namespace
+
+// Wraps `base` with emulated time. Captures a best-effort starting time and
+// registers the two boolean options above so the flags are configurable via
+// the standard options machinery.
+EmulatedSystemClock::EmulatedSystemClock(
+    const std::shared_ptr<SystemClock>& base, bool time_elapse_only_sleep)
+    : SystemClockWrapper(base),
+      maybe_starting_time_(MaybeCurrentTime(base)),
+      time_elapse_only_sleep_(time_elapse_only_sleep),
+      no_slowdown_(time_elapse_only_sleep) {
+  RegisterOptions("", this, &time_elapse_type_info);
+  RegisterOptions("", this, &mock_sleep_type_info);
+}
+
+// An in-memory "file": a std::string plus bookkeeping. Reference-counted;
+// callers pair Ref()/Unref() and the object deletes itself when the count
+// reaches zero. data_ mutations are guarded by mutex_; size_, modified_time_
+// and fsynced_bytes_ are atomics so they can be read without the lock.
+class MemFile {
+ public:
+  explicit MemFile(SystemClock* clock, const std::string& fn,
+                   bool _is_lock_file = false)
+      : clock_(clock),
+        fn_(fn),
+        refs_(0),
+        is_lock_file_(_is_lock_file),
+        locked_(false),
+        size_(0),
+        modified_time_(Now()),
+        // Deterministic per-file RNG seed derived from the file name.
+        rnd_(Lower32of64(GetSliceNPHash64(fn))),
+        fsynced_bytes_(0) {}
+  // No copying allowed.
+  MemFile(const MemFile&) = delete;
+  void operator=(const MemFile&) = delete;
+
+  void Ref() {
+    MutexLock lock(&mutex_);
+    ++refs_;
+  }
+
+  bool is_lock_file() const { return is_lock_file_; }
+
+  // Acquires the (advisory) lock; returns false if already held.
+  // Only valid on lock files.
+  bool Lock() {
+    assert(is_lock_file_);
+    MutexLock lock(&mutex_);
+    if (locked_) {
+      return false;
+    } else {
+      locked_ = true;
+      return true;
+    }
+  }
+
+  void Unlock() {
+    assert(is_lock_file_);
+    MutexLock lock(&mutex_);
+    locked_ = false;
+  }
+
+  // Drops one reference; deletes `this` when the count reaches zero.
+  // The delete happens outside the lock since ~MemFile destroys mutex_.
+  void Unref() {
+    bool do_delete = false;
+    {
+      MutexLock lock(&mutex_);
+      --refs_;
+      assert(refs_ >= 0);
+      if (refs_ <= 0) {
+        do_delete = true;
+      }
+    }
+
+    if (do_delete) {
+      delete this;
+    }
+  }
+
+  uint64_t Size() const { return size_; }
+
+  // Shrink-only truncate; growing requests are silently ignored.
+  void Truncate(size_t size, const IOOptions& /*options*/,
+                IODebugContext* /*dbg*/) {
+    MutexLock lock(&mutex_);
+    if (size < size_) {
+      data_.resize(size);
+      size_ = size;
+    }
+  }
+
+  // Randomly corrupts up to 512 bytes of the not-yet-fsynced tail, to
+  // simulate data loss of unsynced writes.
+  // NOTE(review): start/end are computed from size_/fsynced_bytes_ before
+  // mutex_ is taken, so a concurrent Truncate could shrink data_ under the
+  // chosen range — presumably callers serialize this; confirm.
+  void CorruptBuffer() {
+    if (fsynced_bytes_ >= size_) {
+      return;
+    }
+    uint64_t buffered_bytes = size_ - fsynced_bytes_;
+    uint64_t start =
+        fsynced_bytes_ + rnd_.Uniform(static_cast<int>(buffered_bytes));
+    uint64_t end = std::min(start + 512, size_.load());
+    MutexLock lock(&mutex_);
+    for (uint64_t pos = start; pos < end; ++pos) {
+      data_[static_cast<size_t>(pos)] = static_cast<char>(rnd_.Uniform(256));
+    }
+  }
+
+  // Reads up to `n` bytes at `offset`, clamped to the current size. With a
+  // null `scratch`, *result aliases data_ directly (mmap-style read).
+  IOStatus Read(uint64_t offset, size_t n, const IOOptions& /*options*/,
+                Slice* result, char* scratch, IODebugContext* /*dbg*/) const {
+    {
+      // Allow tests to inject a read error via sync point.
+      IOStatus s;
+      TEST_SYNC_POINT_CALLBACK("MemFile::Read:IOStatus", &s);
+      if (!s.ok()) {
+        // with sync point only
+        *result = Slice();
+        return s;
+      }
+    }
+    MutexLock lock(&mutex_);
+    const uint64_t available = Size() - std::min(Size(), offset);
+    size_t offset_ = static_cast<size_t>(offset);
+    if (n > available) {
+      n = static_cast<size_t>(available);
+    }
+    if (n == 0) {
+      *result = Slice();
+      return IOStatus::OK();
+    }
+    if (scratch) {
+      memcpy(scratch, &(data_[offset_]), n);
+      *result = Slice(scratch, n);
+    } else {
+      *result = Slice(&(data_[offset_]), n);
+    }
+    return IOStatus::OK();
+  }
+
+  // Positioned write; extends the file if the write goes past the end.
+  IOStatus Write(uint64_t offset, const Slice& data,
+                 const IOOptions& /*options*/, IODebugContext* /*dbg*/) {
+    MutexLock lock(&mutex_);
+    size_t offset_ = static_cast<size_t>(offset);
+    if (offset + data.size() > data_.size()) {
+      data_.resize(offset_ + data.size());
+    }
+    data_.replace(offset_, data.size(), data.data(), data.size());
+    size_ = data_.size();
+    modified_time_ = Now();
+    return IOStatus::OK();
+  }
+
+  IOStatus Append(const Slice& data, const IOOptions& /*options*/,
+                  IODebugContext* /*dbg*/) {
+    MutexLock lock(&mutex_);
+    data_.append(data.data(), data.size());
+    size_ = data_.size();
+    modified_time_ = Now();
+    return IOStatus::OK();
+  }
+
+  // Marks everything written so far as durable (see CorruptBuffer).
+  IOStatus Fsync(const IOOptions& /*options*/, IODebugContext* /*dbg*/) {
+    fsynced_bytes_ = size_.load();
+    return IOStatus::OK();
+  }
+
+  uint64_t ModifiedTime() const { return modified_time_; }
+
+ private:
+  // Current time in seconds from clock_; asserts success.
+  uint64_t Now() {
+    int64_t unix_time = 0;
+    auto s = clock_->GetCurrentTime(&unix_time);
+    assert(s.ok());
+    return static_cast<uint64_t>(unix_time);
+  }
+
+  // Private since only Unref() should be used to delete it.
+  ~MemFile() { assert(refs_ == 0); }
+
+  SystemClock* clock_;
+  const std::string fn_;
+  mutable port::Mutex mutex_;
+  int refs_;
+  bool is_lock_file_;
+  bool locked_;
+
+  // Data written into this file, all bytes before fsynced_bytes are
+  // persistent.
+  std::string data_;
+  std::atomic<uint64_t> size_;
+  std::atomic<uint64_t> modified_time_;
+
+  Random rnd_;
+  std::atomic<uint64_t> fsynced_bytes_;
+};
+
+namespace {
+
+// Sequential reader over a shared MemFile; tracks its own cursor (pos_).
+// Holds a reference on the MemFile for its lifetime.
+class MockSequentialFile : public FSSequentialFile {
+ public:
+  explicit MockSequentialFile(MemFile* file, const FileOptions& opts)
+      : file_(file),
+        use_direct_io_(opts.use_direct_reads),
+        use_mmap_read_(opts.use_mmap_reads),
+        pos_(0) {
+    file_->Ref();
+  }
+
+  ~MockSequentialFile() override { file_->Unref(); }
+
+  IOStatus Read(size_t n, const IOOptions& options, Slice* result,
+                char* scratch, IODebugContext* dbg) override {
+    // In mmap mode, pass a null scratch so the Slice aliases the file data.
+    IOStatus s = file_->Read(pos_, n, options, result,
+                             (use_mmap_read_) ? nullptr : scratch, dbg);
+    if (s.ok()) {
+      pos_ += result->size();
+    }
+    return s;
+  }
+
+  bool use_direct_io() const override { return use_direct_io_; }
+
+  // Advances the cursor by up to `n` bytes, clamped at end of file.
+  IOStatus Skip(uint64_t n) override {
+    if (pos_ > file_->Size()) {
+      return IOStatus::IOError("pos_ > file_->Size()");
+    }
+    const uint64_t available = file_->Size() - pos_;
+    if (n > available) {
+      n = available;
+    }
+    pos_ += static_cast<size_t>(n);
+    return IOStatus::OK();
+  }
+
+ private:
+  MemFile* file_;
+  bool use_direct_io_;
+  bool use_mmap_read_;
+  size_t pos_;  // current read offset
+};
+
+// Random-access reader over a shared MemFile. Prefetch is a no-op since all
+// data is already in memory.
+class MockRandomAccessFile : public FSRandomAccessFile {
+ public:
+  explicit MockRandomAccessFile(MemFile* file, const FileOptions& opts)
+      : file_(file),
+        use_direct_io_(opts.use_direct_reads),
+        use_mmap_read_(opts.use_mmap_reads) {
+    file_->Ref();
+  }
+
+  ~MockRandomAccessFile() override { file_->Unref(); }
+
+  bool use_direct_io() const override { return use_direct_io_; }
+
+  IOStatus Prefetch(uint64_t /*offset*/, size_t /*n*/,
+                    const IOOptions& /*options*/,
+                    IODebugContext* /*dbg*/) override {
+    return IOStatus::OK();
+  }
+
+  IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+                Slice* result, char* scratch,
+                IODebugContext* dbg) const override {
+    // mmap mode: null scratch makes the Slice alias the file data directly.
+    if (use_mmap_read_) {
+      return file_->Read(offset, n, options, result, nullptr, dbg);
+    } else {
+      return file_->Read(offset, n, options, result, scratch, dbg);
+    }
+  }
+
+ private:
+  MemFile* file_;
+  bool use_direct_io_;
+  bool use_mmap_read_;
+};
+
+// Read/write wrapper over a shared MemFile. Close() and Sync() both fsync
+// the underlying MemFile (mark its bytes durable); Flush is a no-op.
+class MockRandomRWFile : public FSRandomRWFile {
+ public:
+  explicit MockRandomRWFile(MemFile* file) : file_(file) { file_->Ref(); }
+
+  ~MockRandomRWFile() override { file_->Unref(); }
+
+  IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& options,
+                 IODebugContext* dbg) override {
+    return file_->Write(offset, data, options, dbg);
+  }
+
+  IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+                Slice* result, char* scratch,
+                IODebugContext* dbg) const override {
+    return file_->Read(offset, n, options, result, scratch, dbg);
+  }
+
+  IOStatus Close(const IOOptions& options, IODebugContext* dbg) override {
+    return file_->Fsync(options, dbg);
+  }
+
+  IOStatus Flush(const IOOptions& /*options*/,
+                 IODebugContext* /*dbg*/) override {
+    return IOStatus::OK();
+  }
+
+  IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override {
+    return file_->Fsync(options, dbg);
+  }
+
+ private:
+  MemFile* file_;
+};
+
+class MockWritableFile : public FSWritableFile {
+ public:
+ MockWritableFile(MemFile* file, const FileOptions& opts)
+ : file_(file),
+ use_direct_io_(opts.use_direct_writes),
+ rate_limiter_(opts.rate_limiter) {
+ file_->Ref();
+ }
+
+ ~MockWritableFile() override { file_->Unref(); }
+
+ bool use_direct_io() const override { return false && use_direct_io_; }
+
+ using FSWritableFile::Append;
+ IOStatus Append(const Slice& data, const IOOptions& options,
+ IODebugContext* dbg) override {
+ size_t bytes_written = 0;
+ while (bytes_written < data.size()) {
+ auto bytes = RequestToken(data.size() - bytes_written);
+ IOStatus s = file_->Append(Slice(data.data() + bytes_written, bytes),
+ options, dbg);
+ if (!s.ok()) {
+ return s;
+ }
+ bytes_written += bytes;
+ }
+ return IOStatus::OK();
+ }
+
+ using FSWritableFile::PositionedAppend;
+ IOStatus PositionedAppend(const Slice& data, uint64_t /*offset*/,
+ const IOOptions& options,
+ IODebugContext* dbg) override {
+ assert(use_direct_io_);
+ return Append(data, options, dbg);
+ }
+
+ IOStatus Truncate(uint64_t size, const IOOptions& options,
+ IODebugContext* dbg) override {
+ file_->Truncate(static_cast<size_t>(size), options, dbg);
+ return IOStatus::OK();
+ }
+ IOStatus Close(const IOOptions& options, IODebugContext* dbg) override {
+ return file_->Fsync(options, dbg);
+ }
+
+ IOStatus Flush(const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) override {
+ return IOStatus::OK();
+ }
+
+ IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override {
+ return file_->Fsync(options, dbg);
+ }
+
+ uint64_t GetFileSize(const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) override {
+ return file_->Size();
+ }
+
+ private:
+ inline size_t RequestToken(size_t bytes) {
+ if (rate_limiter_ && io_priority_ < Env::IO_TOTAL) {
+ bytes = std::min(
+ bytes, static_cast<size_t>(rate_limiter_->GetSingleBurstBytes()));
+ rate_limiter_->Request(bytes, io_priority_);
+ }
+ return bytes;
+ }
+
+ MemFile* file_;
+ bool use_direct_io_;
+ RateLimiter* rate_limiter_;
+};
+
+// Directory handle for the in-memory FS: nothing to sync or close, so both
+// operations are no-ops that report success.
+class MockEnvDirectory : public FSDirectory {
+ public:
+  IOStatus Fsync(const IOOptions& /*options*/,
+                 IODebugContext* /*dbg*/) override {
+    return IOStatus::OK();
+  }
+
+  IOStatus Close(const IOOptions& /*options*/,
+                 IODebugContext* /*dbg*/) override {
+    return IOStatus::OK();
+  }
+};
+
+// FileLock token handed to callers of LockFile(); carries only the file
+// name so UnlockFile() can find the MemFile entry again.
+class MockEnvFileLock : public FileLock {
+ public:
+  explicit MockEnvFileLock(const std::string& fname) : fname_(fname) {}
+
+  std::string FileName() const { return fname_; }
+
+ private:
+  const std::string fname_;
+};
+
+// Logger that writes formatted log lines to an in-memory FSWritableFile.
+// Mirrors the formatting logic of the POSIX logger: a timestamp prefix,
+// two-pass buffer sizing, and a pending-flush flag driven by a time budget.
+class TestMemLogger : public Logger {
+ private:
+  std::unique_ptr<FSWritableFile> file_;
+  std::atomic_size_t log_size_;
+  static const uint64_t flush_every_seconds_ = 5;
+  std::atomic_uint_fast64_t last_flush_micros_;
+  SystemClock* clock_;
+  IOOptions options_;
+  IODebugContext* dbg_;
+  std::atomic<bool> flush_pending_;
+
+ public:
+  TestMemLogger(std::unique_ptr<FSWritableFile> f, SystemClock* clock,
+                const IOOptions& options, IODebugContext* dbg,
+                const InfoLogLevel log_level = InfoLogLevel::ERROR_LEVEL)
+      : Logger(log_level),
+        file_(std::move(f)),
+        log_size_(0),
+        last_flush_micros_(0),
+        clock_(clock),
+        options_(options),
+        dbg_(dbg),
+        flush_pending_(false) {}
+  ~TestMemLogger() override {}
+
+  // Clears the pending flag and stamps the flush time. Note that no data is
+  // actually pushed to file_ — the in-memory file needs no flushing.
+  void Flush() override {
+    if (flush_pending_) {
+      flush_pending_ = false;
+    }
+    last_flush_micros_ = clock_->NowMicros();
+  }
+
+  using Logger::Logv;
+  void Logv(const char* format, va_list ap) override {
+    // We try twice: the first time with a fixed-size stack allocated buffer,
+    // and the second time with a much larger dynamically allocated buffer.
+    char buffer[500];
+    for (int iter = 0; iter < 2; iter++) {
+      char* base;
+      int bufsize;
+      if (iter == 0) {
+        bufsize = sizeof(buffer);
+        base = buffer;
+      } else {
+        bufsize = 30000;
+        base = new char[bufsize];
+      }
+      char* p = base;
+      char* limit = base + bufsize;
+
+      // Timestamp prefix uses wall-clock time (not clock_), matching the
+      // real logger's output format.
+      port::TimeVal now_tv;
+      port::GetTimeOfDay(&now_tv, nullptr);
+      const time_t seconds = now_tv.tv_sec;
+      struct tm t;
+      memset(&t, 0, sizeof(t));
+      struct tm* ret __attribute__((__unused__));
+      ret = port::LocalTimeR(&seconds, &t);
+      assert(ret);
+      p += snprintf(p, limit - p, "%04d/%02d/%02d-%02d:%02d:%02d.%06d ",
+                    t.tm_year + 1900, t.tm_mon + 1, t.tm_mday, t.tm_hour,
+                    t.tm_min, t.tm_sec, static_cast<int>(now_tv.tv_usec));
+
+      // Print the message
+      if (p < limit) {
+        va_list backup_ap;
+        // Copy `ap` so the second iteration can re-consume the arguments.
+        va_copy(backup_ap, ap);
+        p += vsnprintf(p, limit - p, format, backup_ap);
+        va_end(backup_ap);
+      }
+
+      // Truncate to available space if necessary
+      if (p >= limit) {
+        if (iter == 0) {
+          continue;  // Try again with larger buffer
+        } else {
+          p = limit - 1;
+        }
+      }
+
+      // Add newline if necessary
+      if (p == base || p[-1] != '\n') {
+        *p++ = '\n';
+      }
+
+      assert(p <= limit);
+      const size_t write_size = p - base;
+
+      Status s = file_->Append(Slice(base, write_size), options_, dbg_);
+      if (s.ok()) {
+        flush_pending_ = true;
+        log_size_ += write_size;
+      }
+      // Periodically clear the pending flag; like Flush(), this performs no
+      // actual I/O on the in-memory file.
+      uint64_t now_micros =
+          static_cast<uint64_t>(now_tv.tv_sec) * 1000000 + now_tv.tv_usec;
+      if (now_micros - last_flush_micros_ >= flush_every_seconds_ * 1000000) {
+        flush_pending_ = false;
+        last_flush_micros_ = now_micros;
+      }
+      if (base != buffer) {
+        delete[] base;
+      }
+      break;
+    }
+  }
+  size_t GetLogFileSize() const override { return log_size_; }
+};
+
+// Options entry for MockFileSystem's supports_direct_io_ flag; offset 0
+// because RegisterOptions is passed the flag's address directly.
+static std::unordered_map<std::string, OptionTypeInfo> mock_fs_type_info = {
+#ifndef ROCKSDB_LITE
+    {"supports_direct_io",
+     {0, OptionType::kBoolean, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone}},
+#endif  // ROCKSDB_LITE
+};
+} // namespace
+
+// clock_ caches the raw pointer of system_clock_ for cheap access; the
+// shared_ptr keeps it alive.
+MockFileSystem::MockFileSystem(const std::shared_ptr<SystemClock>& clock,
+                               bool supports_direct_io)
+    : system_clock_(clock), supports_direct_io_(supports_direct_io) {
+  clock_ = system_clock_.get();
+  RegisterOptions("", &supports_direct_io_, &mock_fs_type_info);
+}
+
+// Releases the filesystem's own reference on every registered MemFile;
+// files still referenced by open handles survive until those close.
+MockFileSystem::~MockFileSystem() {
+  for (auto i = file_map_.begin(); i != file_map_.end(); ++i) {
+    i->second->Unref();
+  }
+}
+
+// If still on the default system clock, adopt the clock of the Env in
+// `options` (keeps mock time consistent with the owning environment).
+Status MockFileSystem::PrepareOptions(const ConfigOptions& options) {
+  Status s = FileSystem::PrepareOptions(options);
+  if (s.ok() && system_clock_ == SystemClock::Default()) {
+    system_clock_ = options.env->GetSystemClock();
+    clock_ = system_clock_.get();
+  }
+  return s;
+}
+
+// Normalizes `db_path` and accepts it only if already absolute; the mock FS
+// has no working directory to resolve relative paths against.
+// NOTE(review): at(0) throws on an empty normalized path — presumably
+// callers never pass an empty path; confirm.
+IOStatus MockFileSystem::GetAbsolutePath(const std::string& db_path,
+                                         const IOOptions& /*options*/,
+                                         std::string* output_path,
+                                         IODebugContext* /*dbg*/) {
+  *output_path = NormalizeMockPath(db_path);
+  if (output_path->at(0) != '/') {
+    return IOStatus::NotSupported("GetAbsolutePath");
+  } else {
+    return IOStatus::OK();
+  }
+}
+
+// Canonical key for file_map_: NormalizePath plus stripping a single
+// trailing separator (but never reducing "/" to empty).
+// NOTE(review): p.back() is undefined if normalization yields an empty
+// string — assumed not to happen for valid inputs; confirm.
+std::string MockFileSystem::NormalizeMockPath(const std::string& path) {
+  std::string p = NormalizePath(path);
+  if (p.back() == kFilePathSeparator && p.size() > 1) {
+    p.pop_back();
+  }
+  return p;
+}
+
+// Partial implementation of the FileSystem interface.
+// Partial implementation of the FileSystem interface.
+// Opens an existing file for sequential reads. Fails for lock files and
+// for direct-read requests when the mock doesn't advertise direct I/O.
+IOStatus MockFileSystem::NewSequentialFile(
+    const std::string& fname, const FileOptions& file_opts,
+    std::unique_ptr<FSSequentialFile>* result, IODebugContext* /*dbg*/) {
+  auto fn = NormalizeMockPath(fname);
+
+  MutexLock lock(&mutex_);
+  if (file_map_.find(fn) == file_map_.end()) {
+    *result = nullptr;
+    return IOStatus::PathNotFound(fn);
+  }
+  auto* f = file_map_[fn];
+  if (f->is_lock_file()) {
+    return IOStatus::InvalidArgument(fn, "Cannot open a lock file.");
+  } else if (file_opts.use_direct_reads && !supports_direct_io_) {
+    return IOStatus::NotSupported("Direct I/O Not Supported");
+  } else {
+    result->reset(new MockSequentialFile(f, file_opts));
+    return IOStatus::OK();
+  }
+}
+
+// Opens an existing file for random-access reads; same preconditions as
+// NewSequentialFile (no lock files, direct reads only if supported).
+IOStatus MockFileSystem::NewRandomAccessFile(
+    const std::string& fname, const FileOptions& file_opts,
+    std::unique_ptr<FSRandomAccessFile>* result, IODebugContext* /*dbg*/) {
+  auto fn = NormalizeMockPath(fname);
+  MutexLock lock(&mutex_);
+  if (file_map_.find(fn) == file_map_.end()) {
+    *result = nullptr;
+    return IOStatus::PathNotFound(fn);
+  }
+  auto* f = file_map_[fn];
+  if (f->is_lock_file()) {
+    return IOStatus::InvalidArgument(fn, "Cannot open a lock file.");
+  } else if (file_opts.use_direct_reads && !supports_direct_io_) {
+    return IOStatus::NotSupported("Direct I/O Not Supported");
+  } else {
+    result->reset(new MockRandomAccessFile(f, file_opts));
+    return IOStatus::OK();
+  }
+}
+
+// Opens an existing file for combined reads and writes; the FileOptions
+// are ignored here (no direct-I/O gate, unlike the read-only openers).
+IOStatus MockFileSystem::NewRandomRWFile(
+    const std::string& fname, const FileOptions& /*file_opts*/,
+    std::unique_ptr<FSRandomRWFile>* result, IODebugContext* /*dbg*/) {
+  auto fn = NormalizeMockPath(fname);
+  MutexLock lock(&mutex_);
+  if (file_map_.find(fn) == file_map_.end()) {
+    *result = nullptr;
+    return IOStatus::PathNotFound(fn);
+  }
+  auto* f = file_map_[fn];
+  if (f->is_lock_file()) {
+    return IOStatus::InvalidArgument(fn, "Cannot open a lock file.");
+  }
+  result->reset(new MockRandomRWFile(f));
+  return IOStatus::OK();
+}
+
+// Reuses `old_fname` by renaming it to `fname` and then opening `fname`
+// as a fresh writable file (which truncates any previous content).
+IOStatus MockFileSystem::ReuseWritableFile(
+    const std::string& fname, const std::string& old_fname,
+    const FileOptions& options, std::unique_ptr<FSWritableFile>* result,
+    IODebugContext* dbg) {
+  auto s = RenameFile(old_fname, fname, IOOptions(), dbg);
+  if (!s.ok()) {
+    return s;
+  } else {
+    result->reset();
+    return NewWritableFile(fname, options, result, dbg);
+  }
+}
+
+// Creates (or truncates) `fname` for writing.
+// NOTE(review): the new MemFile is registered in file_map_ *before* the
+// direct-write support check, so a NotSupported return still leaves an
+// empty file behind — confirm whether that is intended.
+IOStatus MockFileSystem::NewWritableFile(
+    const std::string& fname, const FileOptions& file_opts,
+    std::unique_ptr<FSWritableFile>* result, IODebugContext* /*dbg*/) {
+  auto fn = NormalizeMockPath(fname);
+  MutexLock lock(&mutex_);
+  if (file_map_.find(fn) != file_map_.end()) {
+    DeleteFileInternal(fn);
+  }
+  MemFile* file = new MemFile(clock_, fn, false);
+  file->Ref();
+  file_map_[fn] = file;
+  if (file_opts.use_direct_writes && !supports_direct_io_) {
+    return IOStatus::NotSupported("Direct I/O Not Supported");
+  } else {
+    result->reset(new MockWritableFile(file, file_opts));
+    return IOStatus::OK();
+  }
+}
+
+// Opens `fname` for writing without truncating: reuses the existing MemFile
+// if present, otherwise creates one.
+IOStatus MockFileSystem::ReopenWritableFile(
+    const std::string& fname, const FileOptions& file_opts,
+    std::unique_ptr<FSWritableFile>* result, IODebugContext* /*dbg*/) {
+  auto fn = NormalizeMockPath(fname);
+  MutexLock lock(&mutex_);
+  MemFile* file = nullptr;
+  if (file_map_.find(fn) == file_map_.end()) {
+    file = new MemFile(clock_, fn, false);
+    // Only take a reference when we create the file object
+    file->Ref();
+    file_map_[fn] = file;
+  } else {
+    file = file_map_[fn];
+  }
+  if (file_opts.use_direct_writes && !supports_direct_io_) {
+    return IOStatus::NotSupported("Direct I/O Not Supported");
+  } else {
+    result->reset(new MockWritableFile(file, file_opts));
+    return IOStatus::OK();
+  }
+}
+
+// Directories need no state in the mock FS: always hand back a fresh
+// no-op MockEnvDirectory regardless of the name.
+IOStatus MockFileSystem::NewDirectory(const std::string& /*name*/,
+                                      const IOOptions& /*io_opts*/,
+                                      std::unique_ptr<FSDirectory>* result,
+                                      IODebugContext* /*dbg*/) {
+  result->reset(new MockEnvDirectory());
+  return IOStatus::OK();
+}
+
+// Returns OK if `fname` exists as a file, or as an implicit directory
+// (i.e. some registered path lives strictly beneath it).
+IOStatus MockFileSystem::FileExists(const std::string& fname,
+                                    const IOOptions& /*io_opts*/,
+                                    IODebugContext* /*dbg*/) {
+  auto fn = NormalizeMockPath(fname);
+  MutexLock lock(&mutex_);
+  if (file_map_.find(fn) != file_map_.end()) {
+    // File exists
+    return IOStatus::OK();
+  }
+  // Now also check if fn exists as a dir
+  for (const auto& iter : file_map_) {
+    const std::string& filename = iter.first;
+    // Prefix match plus a '/' right after fn rules out sibling names that
+    // merely share fn as a string prefix (e.g. "/db1" vs "/db").
+    if (filename.size() >= fn.size() + 1 && filename[fn.size()] == '/' &&
+        Slice(filename).starts_with(Slice(fn))) {
+      return IOStatus::OK();
+    }
+  }
+  return IOStatus::NotFound();
+}
+
+// Collects the immediate children of `dir` into `*result`, as names
+// RELATIVE to `dir` (not full paths). Returns true iff `dir` exists either
+// as a registered entry or implicitly (something lives beneath it).
+// Caller must hold mutex_.
+bool MockFileSystem::GetChildrenInternal(const std::string& dir,
+                                         std::vector<std::string>* result) {
+  auto d = NormalizeMockPath(dir);
+  bool found_dir = false;
+  result->clear();
+  for (const auto& iter : file_map_) {
+    const std::string& filename = iter.first;
+
+    if (filename == d) {
+      found_dir = true;
+    } else if (filename.size() >= d.size() + 1 && filename[d.size()] == '/' &&
+               Slice(filename).starts_with(Slice(d))) {
+      found_dir = true;
+      // Only the first path component after `d` is reported; deeper
+      // descendants collapse into their top-level subdirectory name.
+      size_t next_slash = filename.find('/', d.size() + 1);
+      if (next_slash != std::string::npos) {
+        result->push_back(
+            filename.substr(d.size() + 1, next_slash - d.size() - 1));
+      } else {
+        result->push_back(filename.substr(d.size() + 1));
+      }
+    }
+  }
+  // std::unique only removes ADJACENT duplicates — this relies on file_map_
+  // iterating in sorted key order so equal child names are contiguous;
+  // confirm file_map_ is an ordered map.
+  result->erase(std::unique(result->begin(), result->end()), result->end());
+  return found_dir;
+}
+
+// Public wrapper over GetChildrenInternal with locking. The clang-analyzer
+// branch avoids passing `dir` (which the analyzer flags) to NotFound.
+IOStatus MockFileSystem::GetChildren(const std::string& dir,
+                                     const IOOptions& /*options*/,
+                                     std::vector<std::string>* result,
+                                     IODebugContext* /*dbg*/) {
+  MutexLock lock(&mutex_);
+  bool found_dir = GetChildrenInternal(dir, result);
+#ifndef __clang_analyzer__
+  return found_dir ? IOStatus::OK() : IOStatus::NotFound(dir);
+#else
+  return found_dir ? IOStatus::OK() : IOStatus::NotFound();
+#endif
+}
+
+// Removes `fname` (a full, already-normalized path — asserted) from the
+// map, dropping the map's reference. No-op if the entry doesn't exist.
+// Caller must hold mutex_.
+void MockFileSystem::DeleteFileInternal(const std::string& fname) {
+  assert(fname == NormalizeMockPath(fname));
+  const auto& pair = file_map_.find(fname);
+  if (pair != file_map_.end()) {
+    pair->second->Unref();
+    file_map_.erase(fname);
+  }
+}
+
+// Deletes a single file; PathNotFound if it doesn't exist.
+IOStatus MockFileSystem::DeleteFile(const std::string& fname,
+                                    const IOOptions& /*options*/,
+                                    IODebugContext* /*dbg*/) {
+  auto fn = NormalizeMockPath(fname);
+  MutexLock lock(&mutex_);
+  if (file_map_.find(fn) == file_map_.end()) {
+    return IOStatus::PathNotFound(fn);
+  }
+
+  DeleteFileInternal(fn);
+  return IOStatus::OK();
+}
+
+// Truncates `fname` to `size`. MemFile::Truncate only shrinks, so a larger
+// `size` leaves the file unchanged.
+IOStatus MockFileSystem::Truncate(const std::string& fname, size_t size,
+                                  const IOOptions& options,
+                                  IODebugContext* dbg) {
+  auto fn = NormalizeMockPath(fname);
+  MutexLock lock(&mutex_);
+  auto iter = file_map_.find(fn);
+  if (iter == file_map_.end()) {
+    return IOStatus::PathNotFound(fn);
+  }
+  iter->second->Truncate(size, options, dbg);
+  return IOStatus::OK();
+}
+
+// Creates a directory by registering an (empty, non-lock) MemFile under the
+// directory path; IOError if an entry with that name already exists.
+IOStatus MockFileSystem::CreateDir(const std::string& dirname,
+                                   const IOOptions& /*options*/,
+                                   IODebugContext* /*dbg*/) {
+  auto dn = NormalizeMockPath(dirname);
+  MutexLock lock(&mutex_);
+  if (file_map_.find(dn) == file_map_.end()) {
+    MemFile* file = new MemFile(clock_, dn, false);
+    file->Ref();
+    file_map_[dn] = file;
+  } else {
+    return IOStatus::IOError();
+  }
+  return IOStatus::OK();
+}
+
+// Idempotent create: the "already exists" IOError from CreateDir is
+// intentionally swallowed and success is reported either way.
+IOStatus MockFileSystem::CreateDirIfMissing(const std::string& dirname,
+                                            const IOOptions& options,
+                                            IODebugContext* dbg) {
+  CreateDir(dirname, options, dbg).PermitUncheckedError();
+  return IOStatus::OK();
+}
+
+// Deletes directory `dirname` together with its immediate children.
+// Returns PathNotFound if the directory entry doesn't exist.
+IOStatus MockFileSystem::DeleteDir(const std::string& dirname,
+                                   const IOOptions& /*options*/,
+                                   IODebugContext* /*dbg*/) {
+  auto dir = NormalizeMockPath(dirname);
+  MutexLock lock(&mutex_);
+  if (file_map_.find(dir) == file_map_.end()) {
+    return IOStatus::PathNotFound(dir);
+  } else {
+    std::vector<std::string> children;
+    if (GetChildrenInternal(dir, &children)) {
+      for (const auto& child : children) {
+        // GetChildrenInternal() yields names RELATIVE to `dir`, while
+        // DeleteFileInternal() looks up full normalized paths (the keys of
+        // file_map_). Re-attach the prefix — matching RenameFileInternal()
+        // — otherwise the lookup is a no-op and child entries (and their
+        // MemFile references) leak.
+        DeleteFileInternal(dir + "/" + child);
+      }
+    }
+    DeleteFileInternal(dir);
+    return IOStatus::OK();
+  }
+}
+
+// Reports the current size of `fname`. The sync point lets tests inspect
+// (and potentially rewrite) the normalized path before the lookup.
+IOStatus MockFileSystem::GetFileSize(const std::string& fname,
+                                     const IOOptions& /*options*/,
+                                     uint64_t* file_size,
+                                     IODebugContext* /*dbg*/) {
+  auto fn = NormalizeMockPath(fname);
+  TEST_SYNC_POINT_CALLBACK("MockFileSystem::GetFileSize:CheckFileType", &fn);
+  MutexLock lock(&mutex_);
+  auto iter = file_map_.find(fn);
+  if (iter == file_map_.end()) {
+    return IOStatus::PathNotFound(fn);
+  }
+
+  *file_size = iter->second->Size();
+  return IOStatus::OK();
+}
+
+// Reports the last-modified time (seconds, from the mock clock) of `fname`.
+IOStatus MockFileSystem::GetFileModificationTime(const std::string& fname,
+                                                 const IOOptions& /*options*/,
+                                                 uint64_t* time,
+                                                 IODebugContext* /*dbg*/) {
+  auto fn = NormalizeMockPath(fname);
+  MutexLock lock(&mutex_);
+  auto iter = file_map_.find(fn);
+  if (iter == file_map_.end()) {
+    return IOStatus::PathNotFound(fn);
+  }
+  *time = iter->second->ModifiedTime();
+  return IOStatus::OK();
+}
+
+// Moves the entry at `src` (and, recursively, its children) to `dest`,
+// overwriting any existing `dest`. Returns false if `src` doesn't exist.
+// Caller must hold mutex_. Note children are re-prefixed with the full
+// path before recursing, since GetChildrenInternal returns relative names.
+bool MockFileSystem::RenameFileInternal(const std::string& src,
+                                        const std::string& dest) {
+  if (file_map_.find(src) == file_map_.end()) {
+    return false;
+  } else {
+    std::vector<std::string> children;
+    if (GetChildrenInternal(src, &children)) {
+      for (const auto& child : children) {
+        RenameFileInternal(src + "/" + child, dest + "/" + child);
+      }
+    }
+    DeleteFileInternal(dest);
+    file_map_[dest] = file_map_[src];
+    file_map_.erase(src);
+    return true;
+  }
+}
+
+// Public rename: normalizes both paths and delegates to the recursive
+// RenameFileInternal under the lock.
+IOStatus MockFileSystem::RenameFile(const std::string& src,
+                                    const std::string& dest,
+                                    const IOOptions& /*options*/,
+                                    IODebugContext* /*dbg*/) {
+  auto s = NormalizeMockPath(src);
+  auto t = NormalizeMockPath(dest);
+  MutexLock lock(&mutex_);
+  bool found = RenameFileInternal(s, t);
+  if (!found) {
+    return IOStatus::PathNotFound(s);
+  } else {
+    return IOStatus::OK();
+  }
+}
+
+// Hard link: both names map to the same MemFile, with an extra reference
+// taken so the file survives until both names are deleted.
+IOStatus MockFileSystem::LinkFile(const std::string& src,
+                                  const std::string& dest,
+                                  const IOOptions& /*options*/,
+                                  IODebugContext* /*dbg*/) {
+  auto s = NormalizeMockPath(src);
+  auto t = NormalizeMockPath(dest);
+  MutexLock lock(&mutex_);
+  if (file_map_.find(s) == file_map_.end()) {
+    return IOStatus::PathNotFound(s);
+  }
+
+  DeleteFileInternal(t);
+  file_map_[t] = file_map_[s];
+  file_map_[t]->Ref();  // Otherwise it might get deleted when no one uses s
+  return IOStatus::OK();
+}
+
+// Creates a TestMemLogger writing to the MemFile at `fname`, creating the
+// file if needed (existing content is kept — no truncation).
+IOStatus MockFileSystem::NewLogger(const std::string& fname,
+                                   const IOOptions& io_opts,
+                                   std::shared_ptr<Logger>* result,
+                                   IODebugContext* dbg) {
+  auto fn = NormalizeMockPath(fname);
+  MutexLock lock(&mutex_);
+  auto iter = file_map_.find(fn);
+  MemFile* file = nullptr;
+  if (iter == file_map_.end()) {
+    file = new MemFile(clock_, fn, false);
+    file->Ref();
+    file_map_[fn] = file;
+  } else {
+    file = iter->second;
+  }
+  std::unique_ptr<FSWritableFile> f(new MockWritableFile(file, FileOptions()));
+  result->reset(new TestMemLogger(std::move(f), clock_, io_opts, dbg));
+  return IOStatus::OK();
+}
+
+// Acquires the lock file at `fname`, creating it (marked is_lock_file) if
+// absent. Fails if the name belongs to a regular file or the lock is held.
+// On success *flock receives a heap-allocated token freed by UnlockFile().
+IOStatus MockFileSystem::LockFile(const std::string& fname,
+                                  const IOOptions& /*options*/,
+                                  FileLock** flock, IODebugContext* /*dbg*/) {
+  auto fn = NormalizeMockPath(fname);
+  {
+    MutexLock lock(&mutex_);
+    if (file_map_.find(fn) != file_map_.end()) {
+      if (!file_map_[fn]->is_lock_file()) {
+        return IOStatus::InvalidArgument(fname, "Not a lock file.");
+      }
+      if (!file_map_[fn]->Lock()) {
+        return IOStatus::IOError(fn, "lock is already held.");
+      }
+    } else {
+      auto* file = new MemFile(clock_, fn, true);
+      file->Ref();
+      file->Lock();
+      file_map_[fn] = file;
+    }
+  }
+  *flock = new MockEnvFileLock(fn);
+  return IOStatus::OK();
+}
+
+// Releases the lock named by `flock` (the lock-file entry itself stays in
+// the map) and frees the token allocated by LockFile().
+IOStatus MockFileSystem::UnlockFile(FileLock* flock,
+                                    const IOOptions& /*options*/,
+                                    IODebugContext* /*dbg*/) {
+  std::string fn = static_cast_with_check<MockEnvFileLock>(flock)->FileName();
+  {
+    MutexLock lock(&mutex_);
+    if (file_map_.find(fn) != file_map_.end()) {
+      if (!file_map_[fn]->is_lock_file()) {
+        return IOStatus::InvalidArgument(fn, "Not a lock file.");
+      }
+      file_map_[fn]->Unlock();
+    }
+  }
+  delete flock;
+  return IOStatus::OK();
+}
+
+// Fixed scratch-directory path for tests; nothing is created here.
+IOStatus MockFileSystem::GetTestDirectory(const IOOptions& /*options*/,
+                                          std::string* path,
+                                          IODebugContext* /*dbg*/) {
+  *path = "/test";
+  return IOStatus::OK();
+}
+
+// Test hook: corrupts the unsynced tail of `fname` (see
+// MemFile::CorruptBuffer) to simulate loss of unflushed writes.
+Status MockFileSystem::CorruptBuffer(const std::string& fname) {
+  auto fn = NormalizeMockPath(fname);
+  MutexLock lock(&mutex_);
+  auto iter = file_map_.find(fn);
+  if (iter == file_map_.end()) {
+    return Status::IOError(fn, "File not found");
+  }
+  iter->second->CorruptBuffer();
+  return Status::OK();
+}
+
+// MockEnv is just a CompositeEnvWrapper pairing the base Env with the mock
+// file system and (usually emulated) clock.
+MockEnv::MockEnv(Env* env, const std::shared_ptr<FileSystem>& fs,
+                 const std::shared_ptr<SystemClock>& clock)
+    : CompositeEnvWrapper(env, fs, clock) {}
+
+// Convenience factory: wraps `env` with an EmulatedSystemClock configured
+// as time-elapse-only-sleep, then delegates to the two-argument Create.
+MockEnv* MockEnv::Create(Env* env) {
+  auto clock =
+      std::make_shared<EmulatedSystemClock>(env->GetSystemClock(), true);
+  return MockEnv::Create(env, clock);
+}
+
+// Factory taking an explicit clock; builds the MockFileSystem on it.
+MockEnv* MockEnv::Create(Env* env, const std::shared_ptr<SystemClock>& clock) {
+  auto fs = std::make_shared<MockFileSystem>(clock);
+  return new MockEnv(env, fs, clock);
+}
+
+// Forwards the corruption test hook to the underlying MockFileSystem.
+Status MockEnv::CorruptBuffer(const std::string& fname) {
+  auto mock = static_cast_with_check<MockFileSystem>(GetFileSystem().get());
+  return mock->CorruptBuffer(fname);
+}
+
+#ifndef ROCKSDB_LITE
+// This is to maintain the behavior before switching from InMemoryEnv to
+// MockEnv: callers get an in-memory Env built on the default Env.
+Env* NewMemEnv(Env* base_env) { return MockEnv::Create(base_env); }
+
+#else  // ROCKSDB_LITE
+
+// MockEnv is unavailable in LITE builds.
+Env* NewMemEnv(Env* /*base_env*/) { return nullptr; }
+
+#endif  // !ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/env/mock_env.h b/src/rocksdb/env/mock_env.h
new file mode 100644
index 000000000..406a31f63
--- /dev/null
+++ b/src/rocksdb/env/mock_env.h
@@ -0,0 +1,144 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <atomic>
+#include <map>
+#include <string>
+#include <vector>
+
+#include "env/composite_env_wrapper.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/status.h"
+#include "rocksdb/system_clock.h"
+
+namespace ROCKSDB_NAMESPACE {
+class MemFile;
// An in-memory FileSystem implementation for testing.  Every file is a
// MemFile held in file_map_; nothing touches the real disk.
class MockFileSystem : public FileSystem {
 public:
  // `clock` drives file timestamps; `supports_direct_io` controls whether
  // direct-I/O FileOptions are accepted.
  explicit MockFileSystem(const std::shared_ptr<SystemClock>& clock,
                          bool supports_direct_io = true);
  ~MockFileSystem() override;

  static const char* kClassName() { return "MemoryFileSystem"; }
  const char* Name() const override { return kClassName(); }

  // ---- FileSystem interface: in-memory file creation and access ----
  IOStatus NewSequentialFile(const std::string& f, const FileOptions& file_opts,
                             std::unique_ptr<FSSequentialFile>* r,
                             IODebugContext* dbg) override;
  IOStatus NewRandomAccessFile(const std::string& f,
                               const FileOptions& file_opts,
                               std::unique_ptr<FSRandomAccessFile>* r,
                               IODebugContext* dbg) override;

  IOStatus NewRandomRWFile(const std::string& fname,
                           const FileOptions& file_opts,
                           std::unique_ptr<FSRandomRWFile>* result,
                           IODebugContext* dbg) override;
  IOStatus ReuseWritableFile(const std::string& fname,
                             const std::string& old_fname,
                             const FileOptions& file_opts,
                             std::unique_ptr<FSWritableFile>* result,
                             IODebugContext* dbg) override;
  IOStatus NewWritableFile(const std::string& fname,
                           const FileOptions& file_opts,
                           std::unique_ptr<FSWritableFile>* result,
                           IODebugContext* dbg) override;
  IOStatus ReopenWritableFile(const std::string& fname,
                              const FileOptions& options,
                              std::unique_ptr<FSWritableFile>* result,
                              IODebugContext* dbg) override;
  IOStatus NewDirectory(const std::string& /*name*/, const IOOptions& io_opts,
                        std::unique_ptr<FSDirectory>* result,
                        IODebugContext* dbg) override;

  // ---- Metadata and directory operations ----
  IOStatus FileExists(const std::string& fname, const IOOptions& /*io_opts*/,
                      IODebugContext* /*dbg*/) override;
  IOStatus GetChildren(const std::string& dir, const IOOptions& options,
                       std::vector<std::string>* result,
                       IODebugContext* dbg) override;
  IOStatus DeleteFile(const std::string& fname, const IOOptions& options,
                      IODebugContext* dbg) override;
  IOStatus Truncate(const std::string& fname, size_t size,
                    const IOOptions& options, IODebugContext* dbg) override;
  IOStatus CreateDir(const std::string& dirname, const IOOptions& options,
                     IODebugContext* dbg) override;
  IOStatus CreateDirIfMissing(const std::string& dirname,
                              const IOOptions& options,
                              IODebugContext* dbg) override;
  IOStatus DeleteDir(const std::string& dirname, const IOOptions& options,
                     IODebugContext* dbg) override;

  IOStatus GetFileSize(const std::string& fname, const IOOptions& options,
                       uint64_t* file_size, IODebugContext* dbg) override;

  IOStatus GetFileModificationTime(const std::string& fname,
                                   const IOOptions& options,
                                   uint64_t* file_mtime,
                                   IODebugContext* dbg) override;
  IOStatus RenameFile(const std::string& src, const std::string& target,
                      const IOOptions& options, IODebugContext* dbg) override;
  IOStatus LinkFile(const std::string& /*src*/, const std::string& /*target*/,
                    const IOOptions& /*options*/,
                    IODebugContext* /*dbg*/) override;
  IOStatus LockFile(const std::string& fname, const IOOptions& options,
                    FileLock** lock, IODebugContext* dbg) override;
  IOStatus UnlockFile(FileLock* lock, const IOOptions& options,
                      IODebugContext* dbg) override;
  IOStatus GetTestDirectory(const IOOptions& options, std::string* path,
                            IODebugContext* dbg) override;
  IOStatus NewLogger(const std::string& fname, const IOOptions& io_opts,
                     std::shared_ptr<Logger>* result,
                     IODebugContext* dbg) override;
  // Get full directory name for this db.
  IOStatus GetAbsolutePath(const std::string& db_path,
                           const IOOptions& /*options*/,
                           std::string* output_path,
                           IODebugContext* /*dbg*/) override;
  // The mock file system does not model directories as queryable entries.
  IOStatus IsDirectory(const std::string& /*path*/,
                       const IOOptions& /*options*/, bool* /*is_dir*/,
                       IODebugContext* /*dbg*/) override {
    return IOStatus::NotSupported("IsDirectory");
  }

  // Testing hook: scrambles the in-memory buffer of `fname`.
  Status CorruptBuffer(const std::string& fname);
  Status PrepareOptions(const ConfigOptions& options) override;

 private:
  // Internal variants; callers are expected to hold/handle mutex_ as the
  // corresponding public methods require.
  bool RenameFileInternal(const std::string& src, const std::string& dest);
  void DeleteFileInternal(const std::string& fname);
  bool GetChildrenInternal(const std::string& fname,
                           std::vector<std::string>* results);

  // Canonicalizes a path for use as a file_map_ key.
  std::string NormalizeMockPath(const std::string& path);

 private:
  // Map from filenames to MemFile objects, representing a simple file system.
  port::Mutex mutex_;
  std::map<std::string, MemFile*> file_map_;  // Protected by mutex_.
  std::shared_ptr<SystemClock> system_clock_;  // keeps the clock alive
  SystemClock* clock_;                         // raw alias of system_clock_
  bool supports_direct_io_;
};
+
// An Env for testing that composes a base Env with a MockFileSystem (all
// file operations happen in memory) and a SystemClock.
class MockEnv : public CompositeEnvWrapper {
 public:
  // Creates a MockEnv whose clock is an EmulatedSystemClock wrapping
  // `base`'s clock, so tests can fake time.
  static MockEnv* Create(Env* base);
  // Creates a MockEnv whose MockFileSystem is driven by `clock`.
  static MockEnv* Create(Env* base, const std::shared_ptr<SystemClock>& clock);

  static const char* kClassName() { return "MockEnv"; }
  const char* Name() const override { return kClassName(); }

  // Testing hook; forwarded to MockFileSystem::CorruptBuffer.
  Status CorruptBuffer(const std::string& fname);

 private:
  // Construction is private; use Create().
  MockEnv(Env* env, const std::shared_ptr<FileSystem>& fs,
          const std::shared_ptr<SystemClock>& clock);
};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/env/mock_env_test.cc b/src/rocksdb/env/mock_env_test.cc
new file mode 100644
index 000000000..be174bd73
--- /dev/null
+++ b/src/rocksdb/env/mock_env_test.cc
@@ -0,0 +1,84 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+#include "env/mock_env.h"
+
+#include <memory>
+#include <string>
+
+#include "rocksdb/env.h"
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+
// Fixture owning a MockEnv wrapped around the default Env; deleted in the
// destructor so each test gets a fresh in-memory file system.
class MockEnvTest : public testing::Test {
 public:
  MockEnv* env_;
  const EnvOptions soptions_;  // default-constructed options used by all tests

  MockEnvTest() : env_(MockEnv::Create(Env::Default())) {}
  ~MockEnvTest() override { delete env_; }
};
+
// Exercises CorruptBuffer(): data that has been synced must survive
// corruption unchanged, while unsynced buffered data must be altered.
TEST_F(MockEnvTest, Corrupt) {
  const std::string kGood = "this is a good string, synced to disk";
  const std::string kCorrupted = "this part may be corrupted";
  const std::string kFileName = "/dir/f";
  std::unique_ptr<WritableFile> writable_file;
  ASSERT_OK(env_->NewWritableFile(kFileName, &writable_file, soptions_));
  ASSERT_OK(writable_file->Append(kGood));
  ASSERT_TRUE(writable_file->GetFileSize() == kGood.size());

  // Read back the appended data through a separate random-access handle.
  std::string scratch;
  scratch.resize(kGood.size() + kCorrupted.size() + 16);
  Slice result;
  std::unique_ptr<RandomAccessFile> rand_file;
  ASSERT_OK(env_->NewRandomAccessFile(kFileName, &rand_file, soptions_));
  ASSERT_OK(rand_file->Read(0, kGood.size(), &result, &(scratch[0])));
  ASSERT_EQ(result.compare(kGood), 0);

  // Sync + corrupt => no change
  ASSERT_OK(writable_file->Fsync());
  ASSERT_OK(dynamic_cast<MockEnv*>(env_)->CorruptBuffer(kFileName));
  result.clear();
  ASSERT_OK(rand_file->Read(0, kGood.size(), &result, &(scratch[0])));
  ASSERT_EQ(result.compare(kGood), 0);

  // Add new data and corrupt it
  ASSERT_OK(writable_file->Append(kCorrupted));
  ASSERT_TRUE(writable_file->GetFileSize() == kGood.size() + kCorrupted.size());
  result.clear();
  ASSERT_OK(
      rand_file->Read(kGood.size(), kCorrupted.size(), &result, &(scratch[0])));
  ASSERT_EQ(result.compare(kCorrupted), 0);
  // Corrupted
  ASSERT_OK(dynamic_cast<MockEnv*>(env_)->CorruptBuffer(kFileName));
  result.clear();
  ASSERT_OK(
      rand_file->Read(kGood.size(), kCorrupted.size(), &result, &(scratch[0])));
  ASSERT_NE(result.compare(kCorrupted), 0);
}
+
// Verifies that SleepForMicroseconds() advances the emulated clock (by the
// requested amount) rather than sleeping in real time.
TEST_F(MockEnvTest, FakeSleeping) {
  int64_t now = 0;
  auto s = env_->GetCurrentTime(&now);
  ASSERT_OK(s);
  env_->SleepForMicroseconds(3 * 1000 * 1000);
  int64_t after_sleep = 0;
  s = env_->GetCurrentTime(&after_sleep);
  ASSERT_OK(s);
  auto delta = after_sleep - now;
  // The fake sleep contributes exactly 3 seconds; delta can read as 4 if
  // real wall-clock time also ticks over between the two reads.
  ASSERT_TRUE(delta == 3 || delta == 4);
}
+
+} // namespace ROCKSDB_NAMESPACE
+
// Test entry point: install crash diagnostics, then run all gtest cases.
int main(int argc, char** argv) {
  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}
diff --git a/src/rocksdb/env/unique_id_gen.cc b/src/rocksdb/env/unique_id_gen.cc
new file mode 100644
index 000000000..a1986fa15
--- /dev/null
+++ b/src/rocksdb/env/unique_id_gen.cc
@@ -0,0 +1,164 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "env/unique_id_gen.h"
+
+#include <algorithm>
+#include <array>
+#include <cstring>
+#include <random>
+
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/version.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
// Options for GenerateRawUniqueIdImpl: each exclude_* flag disables one
// entropy track (for challenge testing).  Production leaves all false.
struct GenerateRawUniqueIdOpts {
  Env* env = Env::Default();
  bool exclude_port_uuid = false;
  bool exclude_env_details = false;
  bool exclude_random_device = false;
};
+
+// Each of these "tracks" below should be sufficient for generating 128 bits
+// of entropy, after hashing the raw bytes. The tracks are separable for
+// testing purposes, but in production we combine as many tracks as possible
+// to ensure quality results even if some environments have degraded
+// capabilities or quality in some APIs.
+//
+// This approach has not been validated for use in cryptography. The goal is
+// generating globally unique values with high probability without coordination
+// between instances.
+//
+// Linux performance: EntropyTrackRandomDevice is much faster than
+// EntropyTrackEnvDetails, which is much faster than EntropyTrackPortUuid.
+
// Entropy from the platform's RFC 4122 UUID generator, when available.
struct EntropyTrackPortUuid {
  // Textual UUID; 36 chars is the canonical RFC 4122 string length.
  std::array<char, 36> uuid;

  void Populate(const GenerateRawUniqueIdOpts& opts) {
    if (opts.exclude_port_uuid) {
      return;
    }
    std::string s;
    port::GenerateRfcUuid(&s);
    // Copy only when a full-length UUID string was produced; otherwise the
    // (zero-initialized) array is left untouched and contributes nothing.
    if (s.size() >= uuid.size()) {
      std::copy_n(s.begin(), uuid.size(), uuid.begin());
    }
  }
};
+
// Entropy from details of the execution environment: host name, process
// and thread ids, plus two time readings.
struct EntropyTrackEnvDetails {
  std::array<char, 64> hostname_buf;
  int64_t process_id;
  uint64_t thread_id;
  int64_t unix_time;
  uint64_t nano_time;

  void Populate(const GenerateRawUniqueIdOpts& opts) {
    if (opts.exclude_env_details) {
      return;
    }
    // Errors are tolerated: a field left zeroed simply adds no entropy.
    opts.env->GetHostName(hostname_buf.data(), hostname_buf.size())
        .PermitUncheckedError();
    process_id = port::GetProcessID();
    thread_id = opts.env->GetThreadID();
    opts.env->GetCurrentTime(&unix_time).PermitUncheckedError();
    nano_time = opts.env->NowNanos();
  }
};
+
+struct EntropyTrackRandomDevice {
+ using RandType = std::random_device::result_type;
+ static constexpr size_t kNumRandVals =
+ /* generous bits */ 192U / (8U * sizeof(RandType));
+ std::array<RandType, kNumRandVals> rand_vals;
+
+ void Populate(const GenerateRawUniqueIdOpts& opts) {
+ if (opts.exclude_random_device) {
+ return;
+ }
+ std::random_device r;
+ for (auto& val : rand_vals) {
+ val = r();
+ }
+ }
+};
+
// All entropy tracks combined, plus a schema version tag.  The entire
// struct is hashed byte-for-byte, so its layout is part of the "schema".
struct Entropy {
  uint64_t version_identifier;  // packed RocksDB version (see Populate)
  EntropyTrackRandomDevice et1;
  EntropyTrackEnvDetails et2;
  EntropyTrackPortUuid et3;

  void Populate(const GenerateRawUniqueIdOpts& opts) {
    // If we change the format of what goes into the entropy inputs, it's
    // conceivable there could be a physical collision in the hash input
    // even though they are logically different. This value should change
    // if there's a change to the "schema" here, including byte order.
    version_identifier = (uint64_t{ROCKSDB_MAJOR} << 32) +
                         (uint64_t{ROCKSDB_MINOR} << 16) +
                         uint64_t{ROCKSDB_PATCH};
    et1.Populate(opts);
    et2.Populate(opts);
    et3.Populate(opts);
  }
};
+
// Zero-fills the Entropy struct (making padding bytes deterministic),
// populates the enabled tracks, and hashes the raw bytes down to the
// 128-bit result split across *a and *b.
void GenerateRawUniqueIdImpl(uint64_t* a, uint64_t* b,
                             const GenerateRawUniqueIdOpts& opts) {
  Entropy e;
  std::memset(&e, 0, sizeof(e));
  e.Populate(opts);
  Hash2x64(reinterpret_cast<const char*>(&e), sizeof(e), a, b);
}
+
+} // namespace
+
// Public entry point; see header.  Only the port-uuid track may be
// excluded here — the debug-only TEST_ variant can exclude the others.
void GenerateRawUniqueId(uint64_t* a, uint64_t* b, bool exclude_port_uuid) {
  GenerateRawUniqueIdOpts opts;
  opts.exclude_port_uuid = exclude_port_uuid;
  assert(!opts.exclude_env_details);
  assert(!opts.exclude_random_device);
  GenerateRawUniqueIdImpl(a, b, opts);
}
+
#ifndef NDEBUG
// Debug-only variant that can disable any combination of entropy tracks,
// so tests can challenge the quality of each track in isolation.
void TEST_GenerateRawUniqueId(uint64_t* a, uint64_t* b, bool exclude_port_uuid,
                              bool exclude_env_details,
                              bool exclude_random_device) {
  GenerateRawUniqueIdOpts opts;
  opts.exclude_port_uuid = exclude_port_uuid;
  opts.exclude_env_details = exclude_env_details;
  opts.exclude_random_device = exclude_random_device;
  GenerateRawUniqueIdImpl(a, b, opts);
}
#endif
+
// (Re-)seeds the generator: records the current process id (used by
// GenerateNext to detect fork()) and draws a fresh random 128-bit base.
// Not thread safe relative to concurrent GenerateNext() calls.
void SemiStructuredUniqueIdGen::Reset() {
  saved_process_id_ = port::GetProcessID();
  GenerateRawUniqueId(&base_upper_, &base_lower_);
  counter_ = 0;
}
+
// Produces the next 128-bit id.  Thread safe via the atomic counter, as
// long as the process has not forked since Reset().
void SemiStructuredUniqueIdGen::GenerateNext(uint64_t* upper, uint64_t* lower) {
  if (port::GetProcessID() == saved_process_id_) {
    // Safe to increment the atomic for guaranteed uniqueness within this
    // process lifetime. Xor slightly better than +. See
    // https://github.com/pdillinger/unique_id
    *lower = base_lower_ ^ counter_.fetch_add(1);
    *upper = base_upper_;
  } else {
    // There must have been a fork() or something. Rather than attempting to
    // update in a thread-safe way, simply fall back on GenerateRawUniqueId.
    GenerateRawUniqueId(upper, lower);
  }
}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/env/unique_id_gen.h b/src/rocksdb/env/unique_id_gen.h
new file mode 100644
index 000000000..17e71e622
--- /dev/null
+++ b/src/rocksdb/env/unique_id_gen.h
@@ -0,0 +1,71 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+// This file is for functions that generate unique identifiers by
+// (at least in part) by extracting novel entropy or sources of uniqueness
+// from the execution environment. (By contrast, random.h is for algorithmic
+// pseudorandomness.)
+//
+// These functions could eventually migrate to public APIs, such as in Env.
+
+#pragma once
+
+#include <atomic>
+#include <cstdint>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Generates a new 128-bit identifier that is universally unique
+// (with high probability) for each call. The result is split into
+// two 64-bit pieces. This function has NOT been validated for use in
+// cryptography.
+//
+// This is used in generating DB session IDs and by Env::GenerateUniqueId
+// (used for DB IDENTITY) if the platform does not provide a generator of
+// RFC 4122 UUIDs or fails somehow. (Set exclude_port_uuid=true if this
+// function is used as a fallback for GenerateRfcUuid, because no need
+// trying it again.)
+void GenerateRawUniqueId(uint64_t* a, uint64_t* b,
+ bool exclude_port_uuid = false);
+
+#ifndef NDEBUG
+// A version of above with options for challenge testing
+void TEST_GenerateRawUniqueId(uint64_t* a, uint64_t* b, bool exclude_port_uuid,
+ bool exclude_env_details,
+ bool exclude_random_device);
+#endif
+
+// Generates globally unique ids with lower probability of any collisions
+// vs. each unique id being independently random (GenerateRawUniqueId).
+// We call this "semi-structured" because between different
+// SemiStructuredUniqueIdGen objects, the IDs are separated by random
+// intervals (unstructured), but within a single SemiStructuredUniqueIdGen
+// object, the generated IDs are trivially related (structured). See
+// https://github.com/pdillinger/unique_id for how this improves probability
+// of no collision. In short, if we have n SemiStructuredUniqueIdGen
+// objects each generating m IDs, the first collision is expected at
+// around n = sqrt(2^128 / m), equivalently n * sqrt(m) = 2^64,
+// rather than n * m = 2^64 for fully random IDs.
class SemiStructuredUniqueIdGen {
 public:
  // Initializes with random starting state (from GenerateRawUniqueId)
  SemiStructuredUniqueIdGen() { Reset(); }
  // Re-initializes, but not thread safe
  void Reset();

  // Assuming no fork(), `lower` is guaranteed unique from one call
  // to the next (thread safe).
  void GenerateNext(uint64_t* upper, uint64_t* lower);

 private:
  uint64_t base_upper_;  // random upper 64 bits; fixed between Resets
  uint64_t base_lower_;  // random lower 64 bits; xor-combined with counter_
  std::atomic<uint64_t> counter_;  // monotonic; makes each `lower` distinct
  int64_t saved_process_id_;  // used to detect fork() in GenerateNext
};
+
+} // namespace ROCKSDB_NAMESPACE