diff options
Diffstat (limited to 'src/rocksdb/env')
26 files changed, 15583 insertions, 0 deletions
diff --git a/src/rocksdb/env/composite_env.cc b/src/rocksdb/env/composite_env.cc new file mode 100644 index 000000000..b93aa9fcb --- /dev/null +++ b/src/rocksdb/env/composite_env.cc @@ -0,0 +1,544 @@ +// Copyright (c) 2019-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#include "env/composite_env_wrapper.h" +#include "rocksdb/utilities/options_type.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { +namespace { +// The CompositeEnvWrapper class provides an interface that is compatible +// with the old monolithic Env API, and an implementation that wraps around +// the new Env that provides threading and other OS related functionality, and +// the new FileSystem API that provides storage functionality. By +// providing the old Env interface, it allows the rest of RocksDB code to +// be agnostic of whether the underlying Env implementation is a monolithic +// Env or an Env + FileSystem. In the former case, the user will specify +// Options::env only, whereas in the latter case, the user will specify +// Options::env and Options::file_system. 
+ +class CompositeSequentialFileWrapper : public SequentialFile { + public: + explicit CompositeSequentialFileWrapper( + std::unique_ptr<FSSequentialFile>& target) + : target_(std::move(target)) {} + + Status Read(size_t n, Slice* result, char* scratch) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Read(n, io_opts, result, scratch, &dbg); + } + Status Skip(uint64_t n) override { return target_->Skip(n); } + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + Status InvalidateCache(size_t offset, size_t length) override { + return target_->InvalidateCache(offset, length); + } + Status PositionedRead(uint64_t offset, size_t n, Slice* result, + char* scratch) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->PositionedRead(offset, n, io_opts, result, scratch, &dbg); + } + + private: + std::unique_ptr<FSSequentialFile> target_; +}; + +class CompositeRandomAccessFileWrapper : public RandomAccessFile { + public: + explicit CompositeRandomAccessFileWrapper( + std::unique_ptr<FSRandomAccessFile>& target) + : target_(std::move(target)) {} + + Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Read(offset, n, io_opts, result, scratch, &dbg); + } + Status MultiRead(ReadRequest* reqs, size_t num_reqs) override { + IOOptions io_opts; + IODebugContext dbg; + std::vector<FSReadRequest> fs_reqs; + Status status; + + fs_reqs.resize(num_reqs); + for (size_t i = 0; i < num_reqs; ++i) { + fs_reqs[i].offset = reqs[i].offset; + fs_reqs[i].len = reqs[i].len; + fs_reqs[i].scratch = reqs[i].scratch; + fs_reqs[i].status = IOStatus::OK(); + } + status = target_->MultiRead(fs_reqs.data(), num_reqs, io_opts, &dbg); + for (size_t i = 0; i < num_reqs; ++i) { + reqs[i].result = fs_reqs[i].result; + reqs[i].status = 
fs_reqs[i].status; + } + return status; + } + Status Prefetch(uint64_t offset, size_t n) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Prefetch(offset, n, io_opts, &dbg); + } + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + } + void Hint(AccessPattern pattern) override { + target_->Hint((FSRandomAccessFile::AccessPattern)pattern); + } + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + Status InvalidateCache(size_t offset, size_t length) override { + return target_->InvalidateCache(offset, length); + } + + private: + std::unique_ptr<FSRandomAccessFile> target_; +}; + +class CompositeWritableFileWrapper : public WritableFile { + public: + explicit CompositeWritableFileWrapper(std::unique_ptr<FSWritableFile>& t) + : target_(std::move(t)) {} + + Status Append(const Slice& data) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Append(data, io_opts, &dbg); + } + Status Append(const Slice& data, + const DataVerificationInfo& verification_info) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Append(data, io_opts, verification_info, &dbg); + } + Status PositionedAppend(const Slice& data, uint64_t offset) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->PositionedAppend(data, offset, io_opts, &dbg); + } + Status PositionedAppend( + const Slice& data, uint64_t offset, + const DataVerificationInfo& verification_info) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->PositionedAppend(data, offset, io_opts, verification_info, + &dbg); + } + Status Truncate(uint64_t size) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Truncate(size, io_opts, &dbg); + } + Status Close() override { + IOOptions io_opts; + IODebugContext dbg; + return 
target_->Close(io_opts, &dbg); + } + Status Flush() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Flush(io_opts, &dbg); + } + Status Sync() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Sync(io_opts, &dbg); + } + Status Fsync() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Fsync(io_opts, &dbg); + } + bool IsSyncThreadSafe() const override { return target_->IsSyncThreadSafe(); } + + bool use_direct_io() const override { return target_->use_direct_io(); } + + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + + void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override { + target_->SetWriteLifeTimeHint(hint); + } + + Env::WriteLifeTimeHint GetWriteLifeTimeHint() override { + return target_->GetWriteLifeTimeHint(); + } + + uint64_t GetFileSize() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->GetFileSize(io_opts, &dbg); + } + + void SetPreallocationBlockSize(size_t size) override { + target_->SetPreallocationBlockSize(size); + } + + void GetPreallocationStatus(size_t* block_size, + size_t* last_allocated_block) override { + target_->GetPreallocationStatus(block_size, last_allocated_block); + } + + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + } + + Status InvalidateCache(size_t offset, size_t length) override { + return target_->InvalidateCache(offset, length); + } + + Status RangeSync(uint64_t offset, uint64_t nbytes) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->RangeSync(offset, nbytes, io_opts, &dbg); + } + + void PrepareWrite(size_t offset, size_t len) override { + IOOptions io_opts; + IODebugContext dbg; + target_->PrepareWrite(offset, len, io_opts, &dbg); + } + + Status Allocate(uint64_t offset, uint64_t len) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Allocate(offset, len, io_opts, 
&dbg); + } + + std::unique_ptr<FSWritableFile>* target() { return &target_; } + + private: + std::unique_ptr<FSWritableFile> target_; +}; + +class CompositeRandomRWFileWrapper : public RandomRWFile { + public: + explicit CompositeRandomRWFileWrapper(std::unique_ptr<FSRandomRWFile>& target) + : target_(std::move(target)) {} + + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + Status Write(uint64_t offset, const Slice& data) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Write(offset, data, io_opts, &dbg); + } + Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Read(offset, n, io_opts, result, scratch, &dbg); + } + Status Flush() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Flush(io_opts, &dbg); + } + Status Sync() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Sync(io_opts, &dbg); + } + Status Fsync() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Fsync(io_opts, &dbg); + } + Status Close() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Close(io_opts, &dbg); + } + + private: + std::unique_ptr<FSRandomRWFile> target_; +}; + +class CompositeDirectoryWrapper : public Directory { + public: + explicit CompositeDirectoryWrapper(std::unique_ptr<FSDirectory>& target) + : target_(std::move(target)) {} + + Status Fsync() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->FsyncWithDirOptions(io_opts, &dbg, DirFsyncOptions()); + } + + Status Close() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Close(io_opts, &dbg); + } + + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + } + + private: + std::unique_ptr<FSDirectory> 
target_; +}; +} // namespace + +Status CompositeEnv::NewSequentialFile(const std::string& f, + std::unique_ptr<SequentialFile>* r, + const EnvOptions& options) { + IODebugContext dbg; + std::unique_ptr<FSSequentialFile> file; + Status status; + status = + file_system_->NewSequentialFile(f, FileOptions(options), &file, &dbg); + if (status.ok()) { + r->reset(new CompositeSequentialFileWrapper(file)); + } + return status; +} + +Status CompositeEnv::NewRandomAccessFile(const std::string& f, + std::unique_ptr<RandomAccessFile>* r, + const EnvOptions& options) { + IODebugContext dbg; + std::unique_ptr<FSRandomAccessFile> file; + Status status; + status = + file_system_->NewRandomAccessFile(f, FileOptions(options), &file, &dbg); + if (status.ok()) { + r->reset(new CompositeRandomAccessFileWrapper(file)); + } + return status; +} + +Status CompositeEnv::NewWritableFile(const std::string& f, + std::unique_ptr<WritableFile>* r, + const EnvOptions& options) { + IODebugContext dbg; + std::unique_ptr<FSWritableFile> file; + Status status; + status = file_system_->NewWritableFile(f, FileOptions(options), &file, &dbg); + if (status.ok()) { + r->reset(new CompositeWritableFileWrapper(file)); + } + return status; +} + +Status CompositeEnv::ReopenWritableFile(const std::string& fname, + std::unique_ptr<WritableFile>* result, + const EnvOptions& options) { + IODebugContext dbg; + Status status; + std::unique_ptr<FSWritableFile> file; + status = file_system_->ReopenWritableFile(fname, FileOptions(options), &file, + &dbg); + if (status.ok()) { + result->reset(new CompositeWritableFileWrapper(file)); + } + return status; +} + +Status CompositeEnv::ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + std::unique_ptr<WritableFile>* r, + const EnvOptions& options) { + IODebugContext dbg; + Status status; + std::unique_ptr<FSWritableFile> file; + status = file_system_->ReuseWritableFile(fname, old_fname, + FileOptions(options), &file, &dbg); + if (status.ok()) { + 
r->reset(new CompositeWritableFileWrapper(file)); + } + return status; +} + +Status CompositeEnv::NewRandomRWFile(const std::string& fname, + std::unique_ptr<RandomRWFile>* result, + const EnvOptions& options) { + IODebugContext dbg; + std::unique_ptr<FSRandomRWFile> file; + Status status; + status = + file_system_->NewRandomRWFile(fname, FileOptions(options), &file, &dbg); + if (status.ok()) { + result->reset(new CompositeRandomRWFileWrapper(file)); + } + return status; +} + +Status CompositeEnv::NewDirectory(const std::string& name, + std::unique_ptr<Directory>* result) { + IOOptions io_opts; + IODebugContext dbg; + std::unique_ptr<FSDirectory> dir; + Status status; + status = file_system_->NewDirectory(name, io_opts, &dir, &dbg); + if (status.ok()) { + result->reset(new CompositeDirectoryWrapper(dir)); + } + return status; +} + +namespace { +static std::unordered_map<std::string, OptionTypeInfo> env_wrapper_type_info = { +#ifndef ROCKSDB_LITE + {"target", + OptionTypeInfo(0, OptionType::kUnknown, OptionVerificationType::kByName, + OptionTypeFlags::kDontSerialize) + .SetParseFunc([](const ConfigOptions& opts, + const std::string& /*name*/, const std::string& value, + void* addr) { + auto target = static_cast<EnvWrapper::Target*>(addr); + return Env::CreateFromString(opts, value, &(target->env), + &(target->guard)); + }) + .SetEqualsFunc([](const ConfigOptions& opts, + const std::string& /*name*/, const void* addr1, + const void* addr2, std::string* mismatch) { + const auto target1 = static_cast<const EnvWrapper::Target*>(addr1); + const auto target2 = static_cast<const EnvWrapper::Target*>(addr2); + if (target1->env != nullptr) { + return target1->env->AreEquivalent(opts, target2->env, mismatch); + } else { + return (target2->env == nullptr); + } + }) + .SetPrepareFunc([](const ConfigOptions& opts, + const std::string& /*name*/, void* addr) { + auto target = static_cast<EnvWrapper::Target*>(addr); + if (target->guard.get() != nullptr) { + target->env = 
target->guard.get(); + } else if (target->env == nullptr) { + target->env = Env::Default(); + } + return target->env->PrepareOptions(opts); + }) + .SetValidateFunc([](const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts, + const std::string& /*name*/, const void* addr) { + const auto target = static_cast<const EnvWrapper::Target*>(addr); + if (target->env == nullptr) { + return Status::InvalidArgument("Target Env not specified"); + } else { + return target->env->ValidateOptions(db_opts, cf_opts); + } + })}, +#endif // ROCKSDB_LITE +}; +static std::unordered_map<std::string, OptionTypeInfo> + composite_fs_wrapper_type_info = { +#ifndef ROCKSDB_LITE + {"file_system", + OptionTypeInfo::AsCustomSharedPtr<FileSystem>( + 0, OptionVerificationType::kByName, OptionTypeFlags::kNone)}, +#endif // ROCKSDB_LITE +}; + +static std::unordered_map<std::string, OptionTypeInfo> + composite_clock_wrapper_type_info = { +#ifndef ROCKSDB_LITE + {"clock", + OptionTypeInfo::AsCustomSharedPtr<SystemClock>( + 0, OptionVerificationType::kByName, OptionTypeFlags::kNone)}, +#endif // ROCKSDB_LITE +}; + +} // namespace + +std::unique_ptr<Env> NewCompositeEnv(const std::shared_ptr<FileSystem>& fs) { + return std::unique_ptr<Env>(new CompositeEnvWrapper(Env::Default(), fs)); +} + +CompositeEnvWrapper::CompositeEnvWrapper(Env* env, + const std::shared_ptr<FileSystem>& fs, + const std::shared_ptr<SystemClock>& sc) + : CompositeEnv(fs, sc), target_(env) { + RegisterOptions("", &target_, &env_wrapper_type_info); + RegisterOptions("", &file_system_, &composite_fs_wrapper_type_info); + RegisterOptions("", &system_clock_, &composite_clock_wrapper_type_info); +} + +CompositeEnvWrapper::CompositeEnvWrapper(const std::shared_ptr<Env>& env, + const std::shared_ptr<FileSystem>& fs, + const std::shared_ptr<SystemClock>& sc) + : CompositeEnv(fs, sc), target_(env) { + RegisterOptions("", &target_, &env_wrapper_type_info); + RegisterOptions("", &file_system_, &composite_fs_wrapper_type_info); + 
RegisterOptions("", &system_clock_, &composite_clock_wrapper_type_info); +} + +Status CompositeEnvWrapper::PrepareOptions(const ConfigOptions& options) { + target_.Prepare(); + if (file_system_ == nullptr) { + file_system_ = target_.env->GetFileSystem(); + } + if (system_clock_ == nullptr) { + system_clock_ = target_.env->GetSystemClock(); + } + return Env::PrepareOptions(options); +} + +#ifndef ROCKSDB_LITE +std::string CompositeEnvWrapper::SerializeOptions( + const ConfigOptions& config_options, const std::string& header) const { + auto options = CompositeEnv::SerializeOptions(config_options, header); + if (target_.env != nullptr && target_.env != Env::Default()) { + options.append("target="); + options.append(target_.env->ToString(config_options)); + } + return options; +} +#endif // ROCKSDB_LITE + +EnvWrapper::EnvWrapper(Env* t) : target_(t) { + RegisterOptions("", &target_, &env_wrapper_type_info); +} + +EnvWrapper::EnvWrapper(std::unique_ptr<Env>&& t) : target_(std::move(t)) { + RegisterOptions("", &target_, &env_wrapper_type_info); +} + +EnvWrapper::EnvWrapper(const std::shared_ptr<Env>& t) : target_(t) { + RegisterOptions("", &target_, &env_wrapper_type_info); +} + +EnvWrapper::~EnvWrapper() {} + +Status EnvWrapper::PrepareOptions(const ConfigOptions& options) { + target_.Prepare(); + return Env::PrepareOptions(options); +} + +#ifndef ROCKSDB_LITE +std::string EnvWrapper::SerializeOptions(const ConfigOptions& config_options, + const std::string& header) const { + auto parent = Env::SerializeOptions(config_options, ""); + if (config_options.IsShallow() || target_.env == nullptr || + target_.env == Env::Default()) { + return parent; + } else { + std::string result = header; + if (!StartsWith(parent, OptionTypeInfo::kIdPropName())) { + result.append(OptionTypeInfo::kIdPropName()).append("="); + } + result.append(parent); + if (!EndsWith(result, config_options.delimiter)) { + result.append(config_options.delimiter); + } + 
result.append("target=").append(target_.env->ToString(config_options)); + return result; + } +} +#endif // ROCKSDB_LITE + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/env/composite_env_wrapper.h b/src/rocksdb/env/composite_env_wrapper.h new file mode 100644 index 000000000..78da6f0ed --- /dev/null +++ b/src/rocksdb/env/composite_env_wrapper.h @@ -0,0 +1,380 @@ +// Copyright (c) 2019-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "rocksdb/system_clock.h" + +#ifdef _WIN32 +// Windows API macro interference +#undef DeleteFile +#undef GetCurrentTime +#undef LoadLibrary +#endif + +namespace ROCKSDB_NAMESPACE { + +class CompositeEnv : public Env { + public: + // Initialize a CompositeEnvWrapper that delegates all thread/time related + // calls to env, and all file operations to fs + explicit CompositeEnv(const std::shared_ptr<FileSystem>& fs, + const std::shared_ptr<SystemClock>& clock) + : Env(fs, clock) {} + + Status RegisterDbPaths(const std::vector<std::string>& paths) override { + return file_system_->RegisterDbPaths(paths); + } + Status UnregisterDbPaths(const std::vector<std::string>& paths) override { + return file_system_->UnregisterDbPaths(paths); + } + + // The following text is boilerplate that forwards all methods to target() + Status NewSequentialFile(const std::string& f, + std::unique_ptr<SequentialFile>* r, + const EnvOptions& options) override; + + Status NewRandomAccessFile(const std::string& f, + std::unique_ptr<RandomAccessFile>* r, + const EnvOptions& options) override; + + Status NewWritableFile(const std::string& f, std::unique_ptr<WritableFile>* r, + const EnvOptions& options) override; + + Status ReopenWritableFile(const std::string& fname, + 
std::unique_ptr<WritableFile>* result, + const EnvOptions& options) override; + + Status ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + std::unique_ptr<WritableFile>* r, + const EnvOptions& options) override; + + Status NewRandomRWFile(const std::string& fname, + std::unique_ptr<RandomRWFile>* result, + const EnvOptions& options) override; + + Status NewMemoryMappedFileBuffer( + const std::string& fname, + std::unique_ptr<MemoryMappedFileBuffer>* result) override { + return file_system_->NewMemoryMappedFileBuffer(fname, result); + } + + Status NewDirectory(const std::string& name, + std::unique_ptr<Directory>* result) override; + + Status FileExists(const std::string& f) override { + IOOptions io_opts; + IODebugContext dbg; + return file_system_->FileExists(f, io_opts, &dbg); + } + Status GetChildren(const std::string& dir, + std::vector<std::string>* r) override { + IOOptions io_opts; + IODebugContext dbg; + return file_system_->GetChildren(dir, io_opts, r, &dbg); + } + Status GetChildrenFileAttributes( + const std::string& dir, std::vector<FileAttributes>* result) override { + IOOptions io_opts; + IODebugContext dbg; + return file_system_->GetChildrenFileAttributes(dir, io_opts, result, &dbg); + } + Status DeleteFile(const std::string& f) override { + IOOptions io_opts; + IODebugContext dbg; + return file_system_->DeleteFile(f, io_opts, &dbg); + } + Status Truncate(const std::string& fname, size_t size) override { + IOOptions io_opts; + IODebugContext dbg; + return file_system_->Truncate(fname, size, io_opts, &dbg); + } + Status CreateDir(const std::string& d) override { + IOOptions io_opts; + IODebugContext dbg; + return file_system_->CreateDir(d, io_opts, &dbg); + } + Status CreateDirIfMissing(const std::string& d) override { + IOOptions io_opts; + IODebugContext dbg; + return file_system_->CreateDirIfMissing(d, io_opts, &dbg); + } + Status DeleteDir(const std::string& d) override { + IOOptions io_opts; + IODebugContext dbg; + 
return file_system_->DeleteDir(d, io_opts, &dbg); + } + Status GetFileSize(const std::string& f, uint64_t* s) override { + IOOptions io_opts; + IODebugContext dbg; + return file_system_->GetFileSize(f, io_opts, s, &dbg); + } + + Status GetFileModificationTime(const std::string& fname, + uint64_t* file_mtime) override { + IOOptions io_opts; + IODebugContext dbg; + return file_system_->GetFileModificationTime(fname, io_opts, file_mtime, + &dbg); + } + + Status RenameFile(const std::string& s, const std::string& t) override { + IOOptions io_opts; + IODebugContext dbg; + return file_system_->RenameFile(s, t, io_opts, &dbg); + } + + Status LinkFile(const std::string& s, const std::string& t) override { + IOOptions io_opts; + IODebugContext dbg; + return file_system_->LinkFile(s, t, io_opts, &dbg); + } + + Status NumFileLinks(const std::string& fname, uint64_t* count) override { + IOOptions io_opts; + IODebugContext dbg; + return file_system_->NumFileLinks(fname, io_opts, count, &dbg); + } + + Status AreFilesSame(const std::string& first, const std::string& second, + bool* res) override { + IOOptions io_opts; + IODebugContext dbg; + return file_system_->AreFilesSame(first, second, io_opts, res, &dbg); + } + + Status LockFile(const std::string& f, FileLock** l) override { + IOOptions io_opts; + IODebugContext dbg; + return file_system_->LockFile(f, io_opts, l, &dbg); + } + + Status UnlockFile(FileLock* l) override { + IOOptions io_opts; + IODebugContext dbg; + return file_system_->UnlockFile(l, io_opts, &dbg); + } + + Status GetAbsolutePath(const std::string& db_path, + std::string* output_path) override { + IOOptions io_opts; + IODebugContext dbg; + return file_system_->GetAbsolutePath(db_path, io_opts, output_path, &dbg); + } + + Status NewLogger(const std::string& fname, + std::shared_ptr<Logger>* result) override { + IOOptions io_opts; + IODebugContext dbg; + return file_system_->NewLogger(fname, io_opts, result, &dbg); + } + + Status IsDirectory(const std::string& 
path, bool* is_dir) override { + IOOptions io_opts; + IODebugContext dbg; + return file_system_->IsDirectory(path, io_opts, is_dir, &dbg); + } + + Status GetTestDirectory(std::string* path) override { + IOOptions io_opts; + IODebugContext dbg; + return file_system_->GetTestDirectory(io_opts, path, &dbg); + } + + EnvOptions OptimizeForLogRead(const EnvOptions& env_options) const override { + return file_system_->OptimizeForLogRead(FileOptions(env_options)); + } + + EnvOptions OptimizeForManifestRead( + const EnvOptions& env_options) const override { + return file_system_->OptimizeForManifestRead(FileOptions(env_options)); + } + + EnvOptions OptimizeForLogWrite(const EnvOptions& env_options, + const DBOptions& db_options) const override { + return file_system_->OptimizeForLogWrite(FileOptions(env_options), + db_options); + } + + EnvOptions OptimizeForManifestWrite( + const EnvOptions& env_options) const override { + return file_system_->OptimizeForManifestWrite(FileOptions(env_options)); + } + + EnvOptions OptimizeForCompactionTableWrite( + const EnvOptions& env_options, + const ImmutableDBOptions& immutable_ops) const override { + return file_system_->OptimizeForCompactionTableWrite( + FileOptions(env_options), immutable_ops); + } + EnvOptions OptimizeForCompactionTableRead( + const EnvOptions& env_options, + const ImmutableDBOptions& db_options) const override { + return file_system_->OptimizeForCompactionTableRead( + FileOptions(env_options), db_options); + } + EnvOptions OptimizeForBlobFileRead( + const EnvOptions& env_options, + const ImmutableDBOptions& db_options) const override { + return file_system_->OptimizeForBlobFileRead(FileOptions(env_options), + db_options); + } + // This seems to clash with a macro on Windows, so #undef it here +#ifdef GetFreeSpace +#undef GetFreeSpace +#endif + Status GetFreeSpace(const std::string& path, uint64_t* diskfree) override { + IOOptions io_opts; + IODebugContext dbg; + return file_system_->GetFreeSpace(path, io_opts, 
diskfree, &dbg); + } + uint64_t NowMicros() override { return system_clock_->NowMicros(); } + uint64_t NowNanos() override { return system_clock_->NowNanos(); } + + uint64_t NowCPUNanos() override { return system_clock_->CPUNanos(); } + + void SleepForMicroseconds(int micros) override { + system_clock_->SleepForMicroseconds(micros); + } + + Status GetCurrentTime(int64_t* unix_time) override { + return system_clock_->GetCurrentTime(unix_time); + } + std::string TimeToString(uint64_t time) override { + return system_clock_->TimeToString(time); + } +}; + +class CompositeEnvWrapper : public CompositeEnv { + public: + // Initialize a CompositeEnvWrapper that delegates all thread/time related + // calls to env, and all file operations to fs + explicit CompositeEnvWrapper(Env* env) + : CompositeEnvWrapper(env, env->GetFileSystem(), env->GetSystemClock()) {} + explicit CompositeEnvWrapper(Env* env, const std::shared_ptr<FileSystem>& fs) + : CompositeEnvWrapper(env, fs, env->GetSystemClock()) {} + + explicit CompositeEnvWrapper(Env* env, const std::shared_ptr<SystemClock>& sc) + : CompositeEnvWrapper(env, env->GetFileSystem(), sc) {} + + explicit CompositeEnvWrapper(Env* env, const std::shared_ptr<FileSystem>& fs, + const std::shared_ptr<SystemClock>& sc); + + explicit CompositeEnvWrapper(const std::shared_ptr<Env>& env, + const std::shared_ptr<FileSystem>& fs) + : CompositeEnvWrapper(env, fs, env->GetSystemClock()) {} + + explicit CompositeEnvWrapper(const std::shared_ptr<Env>& env, + const std::shared_ptr<SystemClock>& sc) + : CompositeEnvWrapper(env, env->GetFileSystem(), sc) {} + + explicit CompositeEnvWrapper(const std::shared_ptr<Env>& env, + const std::shared_ptr<FileSystem>& fs, + const std::shared_ptr<SystemClock>& sc); + + static const char* kClassName() { return "CompositeEnv"; } + const char* Name() const override { return kClassName(); } + bool IsInstanceOf(const std::string& name) const override { + if (name == kClassName()) { + return true; + } else { + 
return CompositeEnv::IsInstanceOf(name); + } + } + const Customizable* Inner() const override { return target_.env; } + + Status PrepareOptions(const ConfigOptions& options) override; +#ifndef ROCKSDB_LITE + std::string SerializeOptions(const ConfigOptions& config_options, + const std::string& header) const override; +#endif // ROCKSDB_LITE + + // Return the target to which this Env forwards all calls + Env* env_target() const { return target_.env; } + +#if !defined(OS_WIN) && !defined(ROCKSDB_NO_DYNAMIC_EXTENSION) + Status LoadLibrary(const std::string& lib_name, + const std::string& search_path, + std::shared_ptr<DynamicLibrary>* result) override { + return target_.env->LoadLibrary(lib_name, search_path, result); + } +#endif + + void Schedule(void (*f)(void* arg), void* a, Priority pri, + void* tag = nullptr, void (*u)(void* arg) = nullptr) override { + return target_.env->Schedule(f, a, pri, tag, u); + } + + int UnSchedule(void* tag, Priority pri) override { + return target_.env->UnSchedule(tag, pri); + } + + void StartThread(void (*f)(void*), void* a) override { + return target_.env->StartThread(f, a); + } + void WaitForJoin() override { return target_.env->WaitForJoin(); } + unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const override { + return target_.env->GetThreadPoolQueueLen(pri); + } + + int ReserveThreads(int threads_to_be_reserved, Priority pri) override { + return target_.env->ReserveThreads(threads_to_be_reserved, pri); + } + + int ReleaseThreads(int threads_to_be_released, Priority pri) override { + return target_.env->ReleaseThreads(threads_to_be_released, pri); + } + + Status GetHostName(char* name, uint64_t len) override { + return target_.env->GetHostName(name, len); + } + void SetBackgroundThreads(int num, Priority pri) override { + return target_.env->SetBackgroundThreads(num, pri); + } + int GetBackgroundThreads(Priority pri) override { + return target_.env->GetBackgroundThreads(pri); + } + + Status SetAllowNonOwnerAccess(bool 
allow_non_owner_access) override { + return target_.env->SetAllowNonOwnerAccess(allow_non_owner_access); + } + + void IncBackgroundThreadsIfNeeded(int num, Priority pri) override { + return target_.env->IncBackgroundThreadsIfNeeded(num, pri); + } + + void LowerThreadPoolIOPriority(Priority pool) override { + target_.env->LowerThreadPoolIOPriority(pool); + } + + void LowerThreadPoolCPUPriority(Priority pool) override { + target_.env->LowerThreadPoolCPUPriority(pool); + } + + Status LowerThreadPoolCPUPriority(Priority pool, CpuPriority pri) override { + return target_.env->LowerThreadPoolCPUPriority(pool, pri); + } + + Status GetThreadList(std::vector<ThreadStatus>* thread_list) override { + return target_.env->GetThreadList(thread_list); + } + + ThreadStatusUpdater* GetThreadStatusUpdater() const override { + return target_.env->GetThreadStatusUpdater(); + } + + uint64_t GetThreadID() const override { return target_.env->GetThreadID(); } + + std::string GenerateUniqueId() override { + return target_.env->GenerateUniqueId(); + } + + private: + EnvWrapper::Target target_; +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/env/emulated_clock.h b/src/rocksdb/env/emulated_clock.h new file mode 100644 index 000000000..622737635 --- /dev/null +++ b/src/rocksdb/env/emulated_clock.h @@ -0,0 +1,114 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#pragma once + +#include <atomic> +#include <string> + +#include "rocksdb/status.h" +#include "rocksdb/system_clock.h" + +namespace ROCKSDB_NAMESPACE { +// A SystemClock that can "mock" sleep and counts its operations. +class EmulatedSystemClock : public SystemClockWrapper { + private: + // Something to return when mocking current time + const int64_t maybe_starting_time_; + std::atomic<int> sleep_counter_{0}; + std::atomic<int> cpu_counter_{0}; + std::atomic<int64_t> addon_microseconds_{0}; + // Do not modify in the env of a running DB (could cause deadlock) + std::atomic<bool> time_elapse_only_sleep_; + bool no_slowdown_; + + public: + explicit EmulatedSystemClock(const std::shared_ptr<SystemClock>& base, + bool time_elapse_only_sleep = false); + + static const char* kClassName() { return "TimeEmulatedSystemClock"; } + const char* Name() const override { return kClassName(); } + + virtual void SleepForMicroseconds(int micros) override { + sleep_counter_++; + if (no_slowdown_ || time_elapse_only_sleep_) { + addon_microseconds_.fetch_add(micros); + } + if (!no_slowdown_) { + SystemClockWrapper::SleepForMicroseconds(micros); + } + } + + void MockSleepForMicroseconds(int64_t micros) { + sleep_counter_++; + assert(no_slowdown_); + addon_microseconds_.fetch_add(micros); + } + + void MockSleepForSeconds(int64_t seconds) { + sleep_counter_++; + assert(no_slowdown_); + addon_microseconds_.fetch_add(seconds * 1000000); + } + + void SetTimeElapseOnlySleep(bool enabled) { + // We cannot set these before destroying the last DB because they might + // cause a deadlock or similar without the appropriate options set in + // the DB. 
+ time_elapse_only_sleep_ = enabled; + no_slowdown_ = enabled; + } + + bool IsTimeElapseOnlySleep() const { return time_elapse_only_sleep_.load(); } + void SetMockSleep(bool enabled = true) { no_slowdown_ = enabled; } + bool IsMockSleepEnabled() const { return no_slowdown_; } + + int GetSleepCounter() const { return sleep_counter_.load(); } + + virtual Status GetCurrentTime(int64_t* unix_time) override { + Status s; + if (time_elapse_only_sleep_) { + *unix_time = maybe_starting_time_; + } else { + s = SystemClockWrapper::GetCurrentTime(unix_time); + } + if (s.ok()) { + // mock microseconds elapsed to seconds of time + *unix_time += addon_microseconds_.load() / 1000000; + } + return s; + } + + virtual uint64_t CPUNanos() override { + cpu_counter_++; + return SystemClockWrapper::CPUNanos(); + } + + virtual uint64_t CPUMicros() override { + cpu_counter_++; + return SystemClockWrapper::CPUMicros(); + } + + virtual uint64_t NowNanos() override { + return (time_elapse_only_sleep_ ? 0 : SystemClockWrapper::NowNanos()) + + addon_microseconds_.load() * 1000; + } + + virtual uint64_t NowMicros() override { + return (time_elapse_only_sleep_ ? 0 : SystemClockWrapper::NowMicros()) + + addon_microseconds_.load(); + } + + int GetCpuCounter() const { return cpu_counter_.load(); } + + void ResetCounters() { + cpu_counter_.store(0); + sleep_counter_.store(0); + } +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/env/env.cc b/src/rocksdb/env/env.cc new file mode 100644 index 000000000..f70d1f067 --- /dev/null +++ b/src/rocksdb/env/env.cc @@ -0,0 +1,1264 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "rocksdb/env.h" + +#include <thread> + +#include "env/composite_env_wrapper.h" +#include "env/emulated_clock.h" +#include "env/mock_env.h" +#include "env/unique_id_gen.h" +#include "logging/env_logger.h" +#include "memory/arena.h" +#include "options/db_options.h" +#include "port/port.h" +#include "rocksdb/convenience.h" +#include "rocksdb/options.h" +#include "rocksdb/system_clock.h" +#include "rocksdb/utilities/customizable_util.h" +#include "rocksdb/utilities/object_registry.h" +#include "rocksdb/utilities/options_type.h" +#include "util/autovector.h" + +namespace ROCKSDB_NAMESPACE { +namespace { +#ifndef ROCKSDB_LITE +static int RegisterBuiltinEnvs(ObjectLibrary& library, + const std::string& /*arg*/) { + library.AddFactory<Env>(MockEnv::kClassName(), [](const std::string& /*uri*/, + std::unique_ptr<Env>* guard, + std::string* /* errmsg */) { + guard->reset(MockEnv::Create(Env::Default())); + return guard->get(); + }); + library.AddFactory<Env>( + CompositeEnvWrapper::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr<Env>* guard, + std::string* /* errmsg */) { + guard->reset(new CompositeEnvWrapper(Env::Default())); + return guard->get(); + }); + size_t num_types; + return static_cast<int>(library.GetFactoryCount(&num_types)); +} +#endif // ROCKSDB_LITE + +static void RegisterSystemEnvs() { +#ifndef ROCKSDB_LITE + static std::once_flag loaded; + std::call_once(loaded, [&]() { + RegisterBuiltinEnvs(*(ObjectLibrary::Default().get()), ""); + }); +#endif // ROCKSDB_LITE +} + +class LegacySystemClock : public SystemClock { + private: + Env* env_; + + public: + explicit LegacySystemClock(Env* env) : env_(env) {} + const char* Name() const override { return "LegacySystemClock"; } + + // Returns the number of micro-seconds since some fixed point in time. 
+ // It is often used as system time such as in GenericRateLimiter + // and other places so a port needs to return system time in order to work. + uint64_t NowMicros() override { return env_->NowMicros(); } + + // Returns the number of nano-seconds since some fixed point in time. Only + // useful for computing deltas of time in one run. + // Default implementation simply relies on NowMicros. + // In platform-specific implementations, NowNanos() should return time points + // that are MONOTONIC. + uint64_t NowNanos() override { return env_->NowNanos(); } + + uint64_t CPUMicros() override { return CPUNanos() / 1000; } + uint64_t CPUNanos() override { return env_->NowCPUNanos(); } + + // Sleep/delay the thread for the prescribed number of micro-seconds. + void SleepForMicroseconds(int micros) override { + env_->SleepForMicroseconds(micros); + } + + // Get the number of seconds since the Epoch, 1970-01-01 00:00:00 (UTC). + // Only overwrites *unix_time on success. + Status GetCurrentTime(int64_t* unix_time) override { + return env_->GetCurrentTime(unix_time); + } + // Converts seconds-since-Jan-01-1970 to a printable string + std::string TimeToString(uint64_t time) override { + return env_->TimeToString(time); + } + +#ifndef ROCKSDB_LITE + std::string SerializeOptions(const ConfigOptions& /*config_options*/, + const std::string& /*prefix*/) const override { + // We do not want the LegacySystemClock to appear in the serialized output. + // This clock is an internal class for those who do not implement one and + // would be part of the Env. As such, do not serialize it here. 
+ return ""; + } +#endif // ROCKSDB_LITE +}; + +class LegacySequentialFileWrapper : public FSSequentialFile { + public: + explicit LegacySequentialFileWrapper( + std::unique_ptr<SequentialFile>&& _target) + : target_(std::move(_target)) {} + + IOStatus Read(size_t n, const IOOptions& /*options*/, Slice* result, + char* scratch, IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Read(n, result, scratch)); + } + IOStatus Skip(uint64_t n) override { + return status_to_io_status(target_->Skip(n)); + } + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + IOStatus InvalidateCache(size_t offset, size_t length) override { + return status_to_io_status(target_->InvalidateCache(offset, length)); + } + IOStatus PositionedRead(uint64_t offset, size_t n, + const IOOptions& /*options*/, Slice* result, + char* scratch, IODebugContext* /*dbg*/) override { + return status_to_io_status( + target_->PositionedRead(offset, n, result, scratch)); + } + + private: + std::unique_ptr<SequentialFile> target_; +}; + +class LegacyRandomAccessFileWrapper : public FSRandomAccessFile { + public: + explicit LegacyRandomAccessFileWrapper( + std::unique_ptr<RandomAccessFile>&& target) + : target_(std::move(target)) {} + + IOStatus Read(uint64_t offset, size_t n, const IOOptions& /*options*/, + Slice* result, char* scratch, + IODebugContext* /*dbg*/) const override { + return status_to_io_status(target_->Read(offset, n, result, scratch)); + } + + IOStatus MultiRead(FSReadRequest* fs_reqs, size_t num_reqs, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + std::vector<ReadRequest> reqs; + Status status; + + reqs.reserve(num_reqs); + for (size_t i = 0; i < num_reqs; ++i) { + ReadRequest req; + + req.offset = fs_reqs[i].offset; + req.len = fs_reqs[i].len; + req.scratch = fs_reqs[i].scratch; + req.status = Status::OK(); + + 
reqs.emplace_back(req); + } + status = target_->MultiRead(reqs.data(), num_reqs); + for (size_t i = 0; i < num_reqs; ++i) { + fs_reqs[i].result = reqs[i].result; + fs_reqs[i].status = status_to_io_status(std::move(reqs[i].status)); + } + return status_to_io_status(std::move(status)); + } + + IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Prefetch(offset, n)); + } + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + } + void Hint(AccessPattern pattern) override { + target_->Hint((RandomAccessFile::AccessPattern)pattern); + } + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + IOStatus InvalidateCache(size_t offset, size_t length) override { + return status_to_io_status(target_->InvalidateCache(offset, length)); + } + + private: + std::unique_ptr<RandomAccessFile> target_; +}; + +class LegacyRandomRWFileWrapper : public FSRandomRWFile { + public: + explicit LegacyRandomRWFileWrapper(std::unique_ptr<RandomRWFile>&& target) + : target_(std::move(target)) {} + + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + IOStatus Write(uint64_t offset, const Slice& data, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Write(offset, data)); + } + IOStatus Read(uint64_t offset, size_t n, const IOOptions& /*options*/, + Slice* result, char* scratch, + IODebugContext* /*dbg*/) const override { + return status_to_io_status(target_->Read(offset, n, result, scratch)); + } + IOStatus Flush(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Flush()); + } + 
IOStatus Sync(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Sync()); + } + IOStatus Fsync(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Fsync()); + } + IOStatus Close(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Close()); + } + + private: + std::unique_ptr<RandomRWFile> target_; +}; + +class LegacyWritableFileWrapper : public FSWritableFile { + public: + explicit LegacyWritableFileWrapper(std::unique_ptr<WritableFile>&& _target) + : target_(std::move(_target)) {} + + IOStatus Append(const Slice& data, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Append(data)); + } + IOStatus Append(const Slice& data, const IOOptions& /*options*/, + const DataVerificationInfo& /*verification_info*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Append(data)); + } + IOStatus PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->PositionedAppend(data, offset)); + } + IOStatus PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& /*options*/, + const DataVerificationInfo& /*verification_info*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->PositionedAppend(data, offset)); + } + IOStatus Truncate(uint64_t size, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Truncate(size)); + } + IOStatus Close(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Close()); + } + IOStatus Flush(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Flush()); + } + IOStatus Sync(const IOOptions& /*options*/, + IODebugContext* 
/*dbg*/) override { + return status_to_io_status(target_->Sync()); + } + IOStatus Fsync(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Fsync()); + } + bool IsSyncThreadSafe() const override { return target_->IsSyncThreadSafe(); } + + bool use_direct_io() const override { return target_->use_direct_io(); } + + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + + void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override { + target_->SetWriteLifeTimeHint(hint); + } + + Env::WriteLifeTimeHint GetWriteLifeTimeHint() override { + return target_->GetWriteLifeTimeHint(); + } + + uint64_t GetFileSize(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return target_->GetFileSize(); + } + + void SetPreallocationBlockSize(size_t size) override { + target_->SetPreallocationBlockSize(size); + } + + void GetPreallocationStatus(size_t* block_size, + size_t* last_allocated_block) override { + target_->GetPreallocationStatus(block_size, last_allocated_block); + } + + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + } + + IOStatus InvalidateCache(size_t offset, size_t length) override { + return status_to_io_status(target_->InvalidateCache(offset, length)); + } + + IOStatus RangeSync(uint64_t offset, uint64_t nbytes, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->RangeSync(offset, nbytes)); + } + + void PrepareWrite(size_t offset, size_t len, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + target_->PrepareWrite(offset, len); + } + + IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Allocate(offset, len)); + } + + private: + std::unique_ptr<WritableFile> target_; +}; + +class LegacyDirectoryWrapper : 
public FSDirectory { + public: + explicit LegacyDirectoryWrapper(std::unique_ptr<Directory>&& target) + : target_(std::move(target)) {} + + IOStatus Fsync(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Fsync()); + } + IOStatus Close(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Close()); + } + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + } + + private: + std::unique_ptr<Directory> target_; +}; + +class LegacyFileSystemWrapper : public FileSystem { + public: + // Initialize an EnvWrapper that delegates all calls to *t + explicit LegacyFileSystemWrapper(Env* t) : target_(t) {} + ~LegacyFileSystemWrapper() override {} + + static const char* kClassName() { return "LegacyFileSystem"; } + const char* Name() const override { return kClassName(); } + + // Return the target to which this Env forwards all calls + Env* target() const { return target_; } + + // The following text is boilerplate that forwards all methods to target() + IOStatus NewSequentialFile(const std::string& f, const FileOptions& file_opts, + std::unique_ptr<FSSequentialFile>* r, + IODebugContext* /*dbg*/) override { + std::unique_ptr<SequentialFile> file; + Status s = target_->NewSequentialFile(f, &file, file_opts); + if (s.ok()) { + r->reset(new LegacySequentialFileWrapper(std::move(file))); + } + return status_to_io_status(std::move(s)); + } + IOStatus NewRandomAccessFile(const std::string& f, + const FileOptions& file_opts, + std::unique_ptr<FSRandomAccessFile>* r, + IODebugContext* /*dbg*/) override { + std::unique_ptr<RandomAccessFile> file; + Status s = target_->NewRandomAccessFile(f, &file, file_opts); + if (s.ok()) { + r->reset(new LegacyRandomAccessFileWrapper(std::move(file))); + } + return status_to_io_status(std::move(s)); + } + IOStatus NewWritableFile(const std::string& f, const FileOptions& file_opts, + 
std::unique_ptr<FSWritableFile>* r, + IODebugContext* /*dbg*/) override { + std::unique_ptr<WritableFile> file; + Status s = target_->NewWritableFile(f, &file, file_opts); + if (s.ok()) { + r->reset(new LegacyWritableFileWrapper(std::move(file))); + } + return status_to_io_status(std::move(s)); + } + IOStatus ReopenWritableFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr<FSWritableFile>* result, + IODebugContext* /*dbg*/) override { + std::unique_ptr<WritableFile> file; + Status s = target_->ReopenWritableFile(fname, &file, file_opts); + if (s.ok()) { + result->reset(new LegacyWritableFileWrapper(std::move(file))); + } + return status_to_io_status(std::move(s)); + } + IOStatus ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + const FileOptions& file_opts, + std::unique_ptr<FSWritableFile>* r, + IODebugContext* /*dbg*/) override { + std::unique_ptr<WritableFile> file; + Status s = target_->ReuseWritableFile(fname, old_fname, &file, file_opts); + if (s.ok()) { + r->reset(new LegacyWritableFileWrapper(std::move(file))); + } + return status_to_io_status(std::move(s)); + } + IOStatus NewRandomRWFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr<FSRandomRWFile>* result, + IODebugContext* /*dbg*/) override { + std::unique_ptr<RandomRWFile> file; + Status s = target_->NewRandomRWFile(fname, &file, file_opts); + if (s.ok()) { + result->reset(new LegacyRandomRWFileWrapper(std::move(file))); + } + return status_to_io_status(std::move(s)); + } + IOStatus NewMemoryMappedFileBuffer( + const std::string& fname, + std::unique_ptr<MemoryMappedFileBuffer>* result) override { + return status_to_io_status( + target_->NewMemoryMappedFileBuffer(fname, result)); + } + IOStatus NewDirectory(const std::string& name, const IOOptions& /*io_opts*/, + std::unique_ptr<FSDirectory>* result, + IODebugContext* /*dbg*/) override { + std::unique_ptr<Directory> dir; + Status s = target_->NewDirectory(name, 
&dir); + if (s.ok()) { + result->reset(new LegacyDirectoryWrapper(std::move(dir))); + } + return status_to_io_status(std::move(s)); + } + IOStatus FileExists(const std::string& f, const IOOptions& /*io_opts*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->FileExists(f)); + } + IOStatus GetChildren(const std::string& dir, const IOOptions& /*io_opts*/, + std::vector<std::string>* r, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->GetChildren(dir, r)); + } + IOStatus GetChildrenFileAttributes(const std::string& dir, + const IOOptions& /*options*/, + std::vector<FileAttributes>* result, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->GetChildrenFileAttributes(dir, result)); + } + IOStatus DeleteFile(const std::string& f, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->DeleteFile(f)); + } + IOStatus Truncate(const std::string& fname, size_t size, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Truncate(fname, size)); + } + IOStatus CreateDir(const std::string& d, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->CreateDir(d)); + } + IOStatus CreateDirIfMissing(const std::string& d, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->CreateDirIfMissing(d)); + } + IOStatus DeleteDir(const std::string& d, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->DeleteDir(d)); + } + IOStatus GetFileSize(const std::string& f, const IOOptions& /*options*/, + uint64_t* s, IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->GetFileSize(f, s)); + } + + IOStatus GetFileModificationTime(const std::string& fname, + const IOOptions& /*options*/, + uint64_t* file_mtime, + IODebugContext* /*dbg*/) 
override { + return status_to_io_status( + target_->GetFileModificationTime(fname, file_mtime)); + } + + IOStatus GetAbsolutePath(const std::string& db_path, + const IOOptions& /*options*/, + std::string* output_path, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->GetAbsolutePath(db_path, output_path)); + } + + IOStatus RenameFile(const std::string& s, const std::string& t, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->RenameFile(s, t)); + } + + IOStatus LinkFile(const std::string& s, const std::string& t, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->LinkFile(s, t)); + } + + IOStatus NumFileLinks(const std::string& fname, const IOOptions& /*options*/, + uint64_t* count, IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->NumFileLinks(fname, count)); + } + + IOStatus AreFilesSame(const std::string& first, const std::string& second, + const IOOptions& /*options*/, bool* res, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->AreFilesSame(first, second, res)); + } + + IOStatus LockFile(const std::string& f, const IOOptions& /*options*/, + FileLock** l, IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->LockFile(f, l)); + } + + IOStatus UnlockFile(FileLock* l, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->UnlockFile(l)); + } + + IOStatus GetTestDirectory(const IOOptions& /*options*/, std::string* path, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->GetTestDirectory(path)); + } + IOStatus NewLogger(const std::string& fname, const IOOptions& /*options*/, + std::shared_ptr<Logger>* result, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->NewLogger(fname, result)); + } + + void SanitizeFileOptions(FileOptions* opts) const override { + 
target_->SanitizeEnvOptions(opts); + } + + FileOptions OptimizeForLogRead( + const FileOptions& file_options) const override { + return target_->OptimizeForLogRead(file_options); + } + FileOptions OptimizeForManifestRead( + const FileOptions& file_options) const override { + return target_->OptimizeForManifestRead(file_options); + } + FileOptions OptimizeForLogWrite(const FileOptions& file_options, + const DBOptions& db_options) const override { + return target_->OptimizeForLogWrite(file_options, db_options); + } + FileOptions OptimizeForManifestWrite( + const FileOptions& file_options) const override { + return target_->OptimizeForManifestWrite(file_options); + } + FileOptions OptimizeForCompactionTableWrite( + const FileOptions& file_options, + const ImmutableDBOptions& immutable_ops) const override { + return target_->OptimizeForCompactionTableWrite(file_options, + immutable_ops); + } + FileOptions OptimizeForCompactionTableRead( + const FileOptions& file_options, + const ImmutableDBOptions& db_options) const override { + return target_->OptimizeForCompactionTableRead(file_options, db_options); + } + FileOptions OptimizeForBlobFileRead( + const FileOptions& file_options, + const ImmutableDBOptions& db_options) const override { + return target_->OptimizeForBlobFileRead(file_options, db_options); + } + +#ifdef GetFreeSpace +#undef GetFreeSpace +#endif + IOStatus GetFreeSpace(const std::string& path, const IOOptions& /*options*/, + uint64_t* diskfree, IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->GetFreeSpace(path, diskfree)); + } + IOStatus IsDirectory(const std::string& path, const IOOptions& /*options*/, + bool* is_dir, IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->IsDirectory(path, is_dir)); + } + +#ifndef ROCKSDB_LITE + std::string SerializeOptions(const ConfigOptions& /*config_options*/, + const std::string& /*prefix*/) const override { + // We do not want the LegacyFileSystem to appear in the serialized 
output. + // This clock is an internal class for those who do not implement one and + // would be part of the Env. As such, do not serialize it here. + return ""; + } +#endif // ROCKSDB_LITE + private: + Env* target_; +}; +} // end anonymous namespace + +Env::Env() : thread_status_updater_(nullptr) { + file_system_ = std::make_shared<LegacyFileSystemWrapper>(this); + system_clock_ = std::make_shared<LegacySystemClock>(this); +} + +Env::Env(const std::shared_ptr<FileSystem>& fs) + : thread_status_updater_(nullptr), file_system_(fs) { + system_clock_ = std::make_shared<LegacySystemClock>(this); +} + +Env::Env(const std::shared_ptr<FileSystem>& fs, + const std::shared_ptr<SystemClock>& clock) + : thread_status_updater_(nullptr), file_system_(fs), system_clock_(clock) {} + +Env::~Env() {} + +Status Env::NewLogger(const std::string& fname, + std::shared_ptr<Logger>* result) { + return NewEnvLogger(fname, this, result); +} + +Status Env::LoadEnv(const std::string& value, Env** result) { + return CreateFromString(ConfigOptions(), value, result); +} + +Status Env::CreateFromString(const ConfigOptions& config_options, + const std::string& value, Env** result) { + Env* base = Env::Default(); + if (value.empty() || base->IsInstanceOf(value)) { + *result = base; + return Status::OK(); + } else { + RegisterSystemEnvs(); + Env* env = *result; + Status s = LoadStaticObject<Env>(config_options, value, nullptr, &env); + if (s.ok()) { + *result = env; + } + return s; + } +} + +Status Env::LoadEnv(const std::string& value, Env** result, + std::shared_ptr<Env>* guard) { + return CreateFromString(ConfigOptions(), value, result, guard); +} + +Status Env::CreateFromString(const ConfigOptions& config_options, + const std::string& value, Env** result, + std::shared_ptr<Env>* guard) { + assert(result); + assert(guard != nullptr); + std::unique_ptr<Env> uniq; + + Env* env = *result; + std::string id; + std::unordered_map<std::string, std::string> opt_map; + + Status status = + 
Customizable::GetOptionsMap(config_options, env, value, &id, &opt_map); + if (!status.ok()) { // GetOptionsMap failed + return status; + } + Env* base = Env::Default(); + if (id.empty() || base->IsInstanceOf(id)) { + env = base; + status = Status::OK(); + } else { + RegisterSystemEnvs(); +#ifndef ROCKSDB_LITE + // First, try to load the Env as a unique object. + status = config_options.registry->NewObject<Env>(id, &env, &uniq); +#else + status = + Status::NotSupported("Cannot load environment in LITE mode", value); +#endif + } + if (config_options.ignore_unsupported_options && status.IsNotSupported()) { + status = Status::OK(); + } else if (status.ok()) { + status = Customizable::ConfigureNewObject(config_options, env, opt_map); + } + if (status.ok()) { + guard->reset(uniq.release()); + *result = env; + } + return status; +} + +Status Env::CreateFromUri(const ConfigOptions& config_options, + const std::string& env_uri, const std::string& fs_uri, + Env** result, std::shared_ptr<Env>* guard) { + *result = config_options.env; + if (env_uri.empty() && fs_uri.empty()) { + // Neither specified. Use the default + guard->reset(); + return Status::OK(); + } else if (!env_uri.empty() && !fs_uri.empty()) { + // Both specified. Cannot choose. Return Invalid + return Status::InvalidArgument("cannot specify both fs_uri and env_uri"); + } else if (fs_uri.empty()) { // Only have an ENV URI. 
Create an Env from it + return CreateFromString(config_options, env_uri, result, guard); + } else { + std::shared_ptr<FileSystem> fs; + Status s = FileSystem::CreateFromString(config_options, fs_uri, &fs); + if (s.ok()) { + guard->reset(new CompositeEnvWrapper(*result, fs)); + *result = guard->get(); + } + return s; + } +} + +std::string Env::PriorityToString(Env::Priority priority) { + switch (priority) { + case Env::Priority::BOTTOM: + return "Bottom"; + case Env::Priority::LOW: + return "Low"; + case Env::Priority::HIGH: + return "High"; + case Env::Priority::USER: + return "User"; + case Env::Priority::TOTAL: + assert(false); + } + return "Invalid"; +} + +uint64_t Env::GetThreadID() const { + std::hash<std::thread::id> hasher; + return hasher(std::this_thread::get_id()); +} + +Status Env::ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + std::unique_ptr<WritableFile>* result, + const EnvOptions& options) { + Status s = RenameFile(old_fname, fname); + if (!s.ok()) { + return s; + } + return NewWritableFile(fname, result, options); +} + +Status Env::GetChildrenFileAttributes(const std::string& dir, + std::vector<FileAttributes>* result) { + assert(result != nullptr); + std::vector<std::string> child_fnames; + Status s = GetChildren(dir, &child_fnames); + if (!s.ok()) { + return s; + } + result->resize(child_fnames.size()); + size_t result_size = 0; + for (size_t i = 0; i < child_fnames.size(); ++i) { + const std::string path = dir + "/" + child_fnames[i]; + if (!(s = GetFileSize(path, &(*result)[result_size].size_bytes)).ok()) { + if (FileExists(path).IsNotFound()) { + // The file may have been deleted since we listed the directory + continue; + } + return s; + } + (*result)[result_size].name = std::move(child_fnames[i]); + result_size++; + } + result->resize(result_size); + return Status::OK(); +} + +Status Env::GetHostNameString(std::string* result) { + std::array<char, kMaxHostNameLen> hostname_buf{}; + Status s = 
GetHostName(hostname_buf.data(), hostname_buf.size()); + if (s.ok()) { + hostname_buf[hostname_buf.size() - 1] = '\0'; + result->assign(hostname_buf.data()); + } + return s; +} + +std::string Env::GenerateUniqueId() { + std::string result; + bool success = port::GenerateRfcUuid(&result); + if (!success) { + // Fall back on our own way of generating a unique ID and adapt it to + // RFC 4122 variant 1 version 4 (a random ID). + // https://en.wikipedia.org/wiki/Universally_unique_identifier + // We already tried GenerateRfcUuid so no need to try it again in + // GenerateRawUniqueId + constexpr bool exclude_port_uuid = true; + uint64_t upper, lower; + GenerateRawUniqueId(&upper, &lower, exclude_port_uuid); + + // Set 4-bit version to 4 + upper = (upper & (~uint64_t{0xf000})) | 0x4000; + // Set unary-encoded variant to 1 (0b10) + lower = (lower & (~(uint64_t{3} << 62))) | (uint64_t{2} << 62); + + // Use 36 character format of RFC 4122 + result.resize(36U); + char* buf = &result[0]; + PutBaseChars<16>(&buf, 8, upper >> 32, /*!uppercase*/ false); + *(buf++) = '-'; + PutBaseChars<16>(&buf, 4, upper >> 16, /*!uppercase*/ false); + *(buf++) = '-'; + PutBaseChars<16>(&buf, 4, upper, /*!uppercase*/ false); + *(buf++) = '-'; + PutBaseChars<16>(&buf, 4, lower >> 48, /*!uppercase*/ false); + *(buf++) = '-'; + PutBaseChars<16>(&buf, 12, lower, /*!uppercase*/ false); + assert(buf == &result[36]); + + // Verify variant 1 version 4 + assert(result[14] == '4'); + assert(result[19] == '8' || result[19] == '9' || result[19] == 'a' || + result[19] == 'b'); + } + return result; +} + +SequentialFile::~SequentialFile() {} + +RandomAccessFile::~RandomAccessFile() {} + +WritableFile::~WritableFile() {} + +MemoryMappedFileBuffer::~MemoryMappedFileBuffer() {} + +Logger::~Logger() {} + +Status Logger::Close() { + if (!closed_) { + closed_ = true; + return CloseImpl(); + } else { + return Status::OK(); + } +} + +Status Logger::CloseImpl() { return Status::NotSupported(); } + 
+FileLock::~FileLock() {} + +void LogFlush(Logger* info_log) { + if (info_log) { + info_log->Flush(); + } +} + +static void Logv(Logger* info_log, const char* format, va_list ap) { + if (info_log && info_log->GetInfoLogLevel() <= InfoLogLevel::INFO_LEVEL) { + info_log->Logv(InfoLogLevel::INFO_LEVEL, format, ap); + } +} + +void Log(Logger* info_log, const char* format, ...) { + va_list ap; + va_start(ap, format); + Logv(info_log, format, ap); + va_end(ap); +} + +void Logger::Logv(const InfoLogLevel log_level, const char* format, + va_list ap) { + static const char* kInfoLogLevelNames[5] = {"DEBUG", "INFO", "WARN", "ERROR", + "FATAL"}; + if (log_level < log_level_) { + return; + } + + if (log_level == InfoLogLevel::INFO_LEVEL) { + // Doesn't print log level if it is INFO level. + // This is to avoid unexpected performance regression after we add + // the feature of log level. All the logs before we add the feature + // are INFO level. We don't want to add extra costs to those existing + // logging. + Logv(format, ap); + } else if (log_level == InfoLogLevel::HEADER_LEVEL) { + LogHeader(format, ap); + } else { + char new_format[500]; + snprintf(new_format, sizeof(new_format) - 1, "[%s] %s", + kInfoLogLevelNames[log_level], format); + Logv(new_format, ap); + } + + if (log_level >= InfoLogLevel::WARN_LEVEL && + log_level != InfoLogLevel::HEADER_LEVEL) { + // Log messages with severity of warning or higher should be rare and are + // sometimes followed by an unclean crash. We want to be sure important + // messages are not lost in an application buffer when that happens. 
+ Flush(); + } +} + +static void Logv(const InfoLogLevel log_level, Logger* info_log, + const char* format, va_list ap) { + if (info_log && info_log->GetInfoLogLevel() <= log_level) { + if (log_level == InfoLogLevel::HEADER_LEVEL) { + info_log->LogHeader(format, ap); + } else { + info_log->Logv(log_level, format, ap); + } + } +} + +void Log(const InfoLogLevel log_level, Logger* info_log, const char* format, + ...) { + va_list ap; + va_start(ap, format); + Logv(log_level, info_log, format, ap); + va_end(ap); +} + +static void Headerv(Logger* info_log, const char* format, va_list ap) { + if (info_log) { + info_log->LogHeader(format, ap); + } +} + +void Header(Logger* info_log, const char* format, ...) { + va_list ap; + va_start(ap, format); + Headerv(info_log, format, ap); + va_end(ap); +} + +static void Debugv(Logger* info_log, const char* format, va_list ap) { + if (info_log && info_log->GetInfoLogLevel() <= InfoLogLevel::DEBUG_LEVEL) { + info_log->Logv(InfoLogLevel::DEBUG_LEVEL, format, ap); + } +} + +void Debug(Logger* info_log, const char* format, ...) { + va_list ap; + va_start(ap, format); + Debugv(info_log, format, ap); + va_end(ap); +} + +static void Infov(Logger* info_log, const char* format, va_list ap) { + if (info_log && info_log->GetInfoLogLevel() <= InfoLogLevel::INFO_LEVEL) { + info_log->Logv(InfoLogLevel::INFO_LEVEL, format, ap); + } +} + +void Info(Logger* info_log, const char* format, ...) { + va_list ap; + va_start(ap, format); + Infov(info_log, format, ap); + va_end(ap); +} + +static void Warnv(Logger* info_log, const char* format, va_list ap) { + if (info_log && info_log->GetInfoLogLevel() <= InfoLogLevel::WARN_LEVEL) { + info_log->Logv(InfoLogLevel::WARN_LEVEL, format, ap); + } +} + +void Warn(Logger* info_log, const char* format, ...) 
{ + va_list ap; + va_start(ap, format); + Warnv(info_log, format, ap); + va_end(ap); +} + +static void Errorv(Logger* info_log, const char* format, va_list ap) { + if (info_log && info_log->GetInfoLogLevel() <= InfoLogLevel::ERROR_LEVEL) { + info_log->Logv(InfoLogLevel::ERROR_LEVEL, format, ap); + } +} + +void Error(Logger* info_log, const char* format, ...) { + va_list ap; + va_start(ap, format); + Errorv(info_log, format, ap); + va_end(ap); +} + +static void Fatalv(Logger* info_log, const char* format, va_list ap) { + if (info_log && info_log->GetInfoLogLevel() <= InfoLogLevel::FATAL_LEVEL) { + info_log->Logv(InfoLogLevel::FATAL_LEVEL, format, ap); + } +} + +void Fatal(Logger* info_log, const char* format, ...) { + va_list ap; + va_start(ap, format); + Fatalv(info_log, format, ap); + va_end(ap); +} + +void LogFlush(const std::shared_ptr<Logger>& info_log) { + LogFlush(info_log.get()); +} + +void Log(const InfoLogLevel log_level, const std::shared_ptr<Logger>& info_log, + const char* format, ...) { + va_list ap; + va_start(ap, format); + Logv(log_level, info_log.get(), format, ap); + va_end(ap); +} + +void Header(const std::shared_ptr<Logger>& info_log, const char* format, ...) { + va_list ap; + va_start(ap, format); + Headerv(info_log.get(), format, ap); + va_end(ap); +} + +void Debug(const std::shared_ptr<Logger>& info_log, const char* format, ...) { + va_list ap; + va_start(ap, format); + Debugv(info_log.get(), format, ap); + va_end(ap); +} + +void Info(const std::shared_ptr<Logger>& info_log, const char* format, ...) { + va_list ap; + va_start(ap, format); + Infov(info_log.get(), format, ap); + va_end(ap); +} + +void Warn(const std::shared_ptr<Logger>& info_log, const char* format, ...) { + va_list ap; + va_start(ap, format); + Warnv(info_log.get(), format, ap); + va_end(ap); +} + +void Error(const std::shared_ptr<Logger>& info_log, const char* format, ...) 
{ + va_list ap; + va_start(ap, format); + Errorv(info_log.get(), format, ap); + va_end(ap); +} + +void Fatal(const std::shared_ptr<Logger>& info_log, const char* format, ...) { + va_list ap; + va_start(ap, format); + Fatalv(info_log.get(), format, ap); + va_end(ap); +} + +void Log(const std::shared_ptr<Logger>& info_log, const char* format, ...) { + va_list ap; + va_start(ap, format); + Logv(info_log.get(), format, ap); + va_end(ap); +} + +Status WriteStringToFile(Env* env, const Slice& data, const std::string& fname, + bool should_sync) { + const auto& fs = env->GetFileSystem(); + return WriteStringToFile(fs.get(), data, fname, should_sync); +} + +Status ReadFileToString(Env* env, const std::string& fname, std::string* data) { + const auto& fs = env->GetFileSystem(); + return ReadFileToString(fs.get(), fname, data); +} + +namespace { // anonymous namespace + +void AssignEnvOptions(EnvOptions* env_options, const DBOptions& options) { + env_options->use_mmap_reads = options.allow_mmap_reads; + env_options->use_mmap_writes = options.allow_mmap_writes; + env_options->use_direct_reads = options.use_direct_reads; + env_options->set_fd_cloexec = options.is_fd_close_on_exec; + env_options->bytes_per_sync = options.bytes_per_sync; + env_options->compaction_readahead_size = options.compaction_readahead_size; + env_options->random_access_max_buffer_size = + options.random_access_max_buffer_size; + env_options->rate_limiter = options.rate_limiter.get(); + env_options->writable_file_max_buffer_size = + options.writable_file_max_buffer_size; + env_options->allow_fallocate = options.allow_fallocate; + env_options->strict_bytes_per_sync = options.strict_bytes_per_sync; + options.env->SanitizeEnvOptions(env_options); +} + +} // namespace + +EnvOptions Env::OptimizeForLogWrite(const EnvOptions& env_options, + const DBOptions& db_options) const { + EnvOptions optimized_env_options(env_options); + optimized_env_options.bytes_per_sync = db_options.wal_bytes_per_sync; + 
optimized_env_options.writable_file_max_buffer_size = + db_options.writable_file_max_buffer_size; + return optimized_env_options; +} + +EnvOptions Env::OptimizeForManifestWrite(const EnvOptions& env_options) const { + return env_options; +} + +EnvOptions Env::OptimizeForLogRead(const EnvOptions& env_options) const { + EnvOptions optimized_env_options(env_options); + optimized_env_options.use_direct_reads = false; + return optimized_env_options; +} + +EnvOptions Env::OptimizeForManifestRead(const EnvOptions& env_options) const { + EnvOptions optimized_env_options(env_options); + optimized_env_options.use_direct_reads = false; + return optimized_env_options; +} + +EnvOptions Env::OptimizeForCompactionTableWrite( + const EnvOptions& env_options, const ImmutableDBOptions& db_options) const { + EnvOptions optimized_env_options(env_options); + optimized_env_options.use_direct_writes = + db_options.use_direct_io_for_flush_and_compaction; + return optimized_env_options; +} + +EnvOptions Env::OptimizeForCompactionTableRead( + const EnvOptions& env_options, const ImmutableDBOptions& db_options) const { + EnvOptions optimized_env_options(env_options); + optimized_env_options.use_direct_reads = db_options.use_direct_reads; + return optimized_env_options; +} +EnvOptions Env::OptimizeForBlobFileRead( + const EnvOptions& env_options, const ImmutableDBOptions& db_options) const { + EnvOptions optimized_env_options(env_options); + optimized_env_options.use_direct_reads = db_options.use_direct_reads; + return optimized_env_options; +} + +EnvOptions::EnvOptions(const DBOptions& options) { + AssignEnvOptions(this, options); +} + +EnvOptions::EnvOptions() { + DBOptions options; + AssignEnvOptions(this, options); +} + +Status NewEnvLogger(const std::string& fname, Env* env, + std::shared_ptr<Logger>* result) { + FileOptions options; + // TODO: Tune the buffer size. 
+ options.writable_file_max_buffer_size = 1024 * 1024; + std::unique_ptr<FSWritableFile> writable_file; + const auto status = env->GetFileSystem()->NewWritableFile( + fname, options, &writable_file, nullptr); + if (!status.ok()) { + return status; + } + + *result = std::make_shared<EnvLogger>(std::move(writable_file), fname, + options, env); + return Status::OK(); +} + +const std::shared_ptr<FileSystem>& Env::GetFileSystem() const { + return file_system_; +} + +const std::shared_ptr<SystemClock>& Env::GetSystemClock() const { + return system_clock_; +} +namespace { +static std::unordered_map<std::string, OptionTypeInfo> sc_wrapper_type_info = { +#ifndef ROCKSDB_LITE + {"target", + OptionTypeInfo::AsCustomSharedPtr<SystemClock>( + 0, OptionVerificationType::kByName, OptionTypeFlags::kDontSerialize)}, +#endif // ROCKSDB_LITE +}; + +} // namespace +SystemClockWrapper::SystemClockWrapper(const std::shared_ptr<SystemClock>& t) + : target_(t) { + RegisterOptions("", &target_, &sc_wrapper_type_info); +} + +Status SystemClockWrapper::PrepareOptions(const ConfigOptions& options) { + if (target_ == nullptr) { + target_ = SystemClock::Default(); + } + return SystemClock::PrepareOptions(options); +} + +#ifndef ROCKSDB_LITE +std::string SystemClockWrapper::SerializeOptions( + const ConfigOptions& config_options, const std::string& header) const { + auto parent = SystemClock::SerializeOptions(config_options, ""); + if (config_options.IsShallow() || target_ == nullptr || + target_->IsInstanceOf(SystemClock::kDefaultName())) { + return parent; + } else { + std::string result = header; + if (!StartsWith(parent, OptionTypeInfo::kIdPropName())) { + result.append(OptionTypeInfo::kIdPropName()).append("="); + } + result.append(parent); + if (!EndsWith(result, config_options.delimiter)) { + result.append(config_options.delimiter); + } + result.append("target=").append(target_->ToString(config_options)); + return result; + } +} +#endif // ROCKSDB_LITE + +#ifndef ROCKSDB_LITE +static int 
RegisterBuiltinSystemClocks(ObjectLibrary& library, + const std::string& /*arg*/) { + library.AddFactory<SystemClock>( + EmulatedSystemClock::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr<SystemClock>* guard, + std::string* /* errmsg */) { + guard->reset(new EmulatedSystemClock(SystemClock::Default())); + return guard->get(); + }); + size_t num_types; + return static_cast<int>(library.GetFactoryCount(&num_types)); +} +#endif // ROCKSDB_LITE + +Status SystemClock::CreateFromString(const ConfigOptions& config_options, + const std::string& value, + std::shared_ptr<SystemClock>* result) { + auto clock = SystemClock::Default(); + if (clock->IsInstanceOf(value)) { + *result = clock; + return Status::OK(); + } else { +#ifndef ROCKSDB_LITE + static std::once_flag once; + std::call_once(once, [&]() { + RegisterBuiltinSystemClocks(*(ObjectLibrary::Default().get()), ""); + }); +#endif // ROCKSDB_LITE + return LoadSharedObject<SystemClock>(config_options, value, nullptr, + result); + } +} +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/env/env_basic_test.cc b/src/rocksdb/env/env_basic_test.cc new file mode 100644 index 000000000..0f18b3218 --- /dev/null +++ b/src/rocksdb/env/env_basic_test.cc @@ -0,0 +1,401 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + +#include <algorithm> +#include <memory> +#include <string> +#include <vector> + +#include "env/mock_env.h" +#include "file/file_util.h" +#include "rocksdb/convenience.h" +#include "rocksdb/env.h" +#include "rocksdb/env_encryption.h" +#include "test_util/testharness.h" + +namespace ROCKSDB_NAMESPACE { +namespace { +using CreateEnvFunc = Env*(); + +// These functions are used to create the various environments under which this +// test can execute. 
These functions are used to allow the test cases to be +// created without the Env being initialized, thereby eliminating a potential +// static initialization fiasco/race condition when attempting to get a +// custom/configured env prior to main being invoked. + +static Env* GetDefaultEnv() { return Env::Default(); } + +static Env* GetMockEnv() { + static std::unique_ptr<Env> mock_env(MockEnv::Create(Env::Default())); + return mock_env.get(); +} +#ifndef ROCKSDB_LITE +static Env* NewTestEncryptedEnv(Env* base, const std::string& provider_id) { + ConfigOptions config_opts; + config_opts.invoke_prepare_options = false; + + std::shared_ptr<EncryptionProvider> provider; + EXPECT_OK(EncryptionProvider::CreateFromString(config_opts, provider_id, + &provider)); + return NewEncryptedEnv(base, provider); +} + +static Env* GetCtrEncryptedEnv() { + static std::unique_ptr<Env> ctr_encrypt_env( + NewTestEncryptedEnv(Env::Default(), "CTR://test")); + return ctr_encrypt_env.get(); +} + +static Env* GetMemoryEnv() { + static std::unique_ptr<Env> mem_env(NewMemEnv(Env::Default())); + return mem_env.get(); +} + +static Env* GetTestEnv() { + static std::shared_ptr<Env> env_guard; + static Env* custom_env = nullptr; + if (custom_env == nullptr) { + const char* uri = getenv("TEST_ENV_URI"); + if (uri != nullptr) { + EXPECT_OK(Env::CreateFromUri(ConfigOptions(), uri, "", &custom_env, + &env_guard)); + } + } + EXPECT_NE(custom_env, nullptr); + return custom_env; +} + +static Env* GetTestFS() { + static std::shared_ptr<Env> fs_env_guard; + static Env* fs_env = nullptr; + if (fs_env == nullptr) { + const char* uri = getenv("TEST_FS_URI"); + if (uri != nullptr) { + EXPECT_OK( + Env::CreateFromUri(ConfigOptions(), uri, "", &fs_env, &fs_env_guard)); + } + } + EXPECT_NE(fs_env, nullptr); + return fs_env; +} +#endif // ROCKSDB_LITE + +} // namespace +class EnvBasicTestWithParam + : public testing::Test, + public ::testing::WithParamInterface<CreateEnvFunc*> { + public: + Env* env_; + const 
EnvOptions soptions_; + std::string test_dir_; + + EnvBasicTestWithParam() : env_(GetParam()()) { + test_dir_ = test::PerThreadDBPath(env_, "env_basic_test"); + } + + void SetUp() override { ASSERT_OK(env_->CreateDirIfMissing(test_dir_)); } + + void TearDown() override { ASSERT_OK(DestroyDir(env_, test_dir_)); } +}; + +class EnvMoreTestWithParam : public EnvBasicTestWithParam {}; + +INSTANTIATE_TEST_CASE_P(EnvDefault, EnvBasicTestWithParam, + ::testing::Values(&GetDefaultEnv)); +INSTANTIATE_TEST_CASE_P(EnvDefault, EnvMoreTestWithParam, + ::testing::Values(&GetDefaultEnv)); + +INSTANTIATE_TEST_CASE_P(MockEnv, EnvBasicTestWithParam, + ::testing::Values(&GetMockEnv)); + +#ifndef ROCKSDB_LITE +// next statements run env test against default encryption code. +INSTANTIATE_TEST_CASE_P(EncryptedEnv, EnvBasicTestWithParam, + ::testing::Values(&GetCtrEncryptedEnv)); +INSTANTIATE_TEST_CASE_P(EncryptedEnv, EnvMoreTestWithParam, + ::testing::Values(&GetCtrEncryptedEnv)); + +INSTANTIATE_TEST_CASE_P(MemEnv, EnvBasicTestWithParam, + ::testing::Values(&GetMemoryEnv)); + +namespace { + +// Returns a vector of 0 or 1 Env*, depending whether an Env is registered for +// TEST_ENV_URI. +// +// The purpose of returning an empty vector (instead of nullptr) is that gtest +// ValuesIn() will skip running tests when given an empty collection. 
+std::vector<CreateEnvFunc*> GetCustomEnvs() { + std::vector<CreateEnvFunc*> res; + const char* uri = getenv("TEST_ENV_URI"); + if (uri != nullptr) { + res.push_back(&GetTestEnv); + } + uri = getenv("TEST_FS_URI"); + if (uri != nullptr) { + res.push_back(&GetTestFS); + } + return res; +} + +} // anonymous namespace + +INSTANTIATE_TEST_CASE_P(CustomEnv, EnvBasicTestWithParam, + ::testing::ValuesIn(GetCustomEnvs())); + +INSTANTIATE_TEST_CASE_P(CustomEnv, EnvMoreTestWithParam, + ::testing::ValuesIn(GetCustomEnvs())); +#endif // ROCKSDB_LITE + +TEST_P(EnvBasicTestWithParam, Basics) { + uint64_t file_size; + std::unique_ptr<WritableFile> writable_file; + std::vector<std::string> children; + + // Check that the directory is empty. + ASSERT_EQ(Status::NotFound(), env_->FileExists(test_dir_ + "/non_existent")); + ASSERT_TRUE(!env_->GetFileSize(test_dir_ + "/non_existent", &file_size).ok()); + ASSERT_OK(env_->GetChildren(test_dir_, &children)); + ASSERT_EQ(0U, children.size()); + + // Create a file. + ASSERT_OK(env_->NewWritableFile(test_dir_ + "/f", &writable_file, soptions_)); + ASSERT_OK(writable_file->Close()); + writable_file.reset(); + + // Check that the file exists. + ASSERT_OK(env_->FileExists(test_dir_ + "/f")); + ASSERT_OK(env_->GetFileSize(test_dir_ + "/f", &file_size)); + ASSERT_EQ(0U, file_size); + ASSERT_OK(env_->GetChildren(test_dir_, &children)); + ASSERT_EQ(1U, children.size()); + ASSERT_EQ("f", children[0]); + ASSERT_OK(env_->DeleteFile(test_dir_ + "/f")); + + // Write to the file. + ASSERT_OK( + env_->NewWritableFile(test_dir_ + "/f1", &writable_file, soptions_)); + ASSERT_OK(writable_file->Append("abc")); + ASSERT_OK(writable_file->Close()); + writable_file.reset(); + ASSERT_OK( + env_->NewWritableFile(test_dir_ + "/f2", &writable_file, soptions_)); + ASSERT_OK(writable_file->Close()); + writable_file.reset(); + + // Check for expected size. 
+ ASSERT_OK(env_->GetFileSize(test_dir_ + "/f1", &file_size)); + ASSERT_EQ(3U, file_size); + + // Check that renaming works. + ASSERT_TRUE( + !env_->RenameFile(test_dir_ + "/non_existent", test_dir_ + "/g").ok()); + ASSERT_OK(env_->RenameFile(test_dir_ + "/f1", test_dir_ + "/g")); + ASSERT_EQ(Status::NotFound(), env_->FileExists(test_dir_ + "/f1")); + ASSERT_OK(env_->FileExists(test_dir_ + "/g")); + ASSERT_OK(env_->GetFileSize(test_dir_ + "/g", &file_size)); + ASSERT_EQ(3U, file_size); + + // Check that renaming overwriting works + ASSERT_OK(env_->RenameFile(test_dir_ + "/f2", test_dir_ + "/g")); + ASSERT_OK(env_->GetFileSize(test_dir_ + "/g", &file_size)); + ASSERT_EQ(0U, file_size); + + // Check that opening non-existent file fails. + std::unique_ptr<SequentialFile> seq_file; + std::unique_ptr<RandomAccessFile> rand_file; + ASSERT_TRUE(!env_->NewSequentialFile(test_dir_ + "/non_existent", &seq_file, + soptions_) + .ok()); + ASSERT_TRUE(!seq_file); + ASSERT_NOK(env_->NewRandomAccessFile(test_dir_ + "/non_existent", &rand_file, + soptions_)); + ASSERT_TRUE(!rand_file); + + // Check that deleting works. + ASSERT_NOK(env_->DeleteFile(test_dir_ + "/non_existent")); + ASSERT_OK(env_->DeleteFile(test_dir_ + "/g")); + ASSERT_EQ(Status::NotFound(), env_->FileExists(test_dir_ + "/g")); + ASSERT_OK(env_->GetChildren(test_dir_, &children)); + ASSERT_EQ(0U, children.size()); + Status s = env_->GetChildren(test_dir_ + "/non_existent", &children); + ASSERT_TRUE(s.IsNotFound()); +} + +TEST_P(EnvBasicTestWithParam, ReadWrite) { + std::unique_ptr<WritableFile> writable_file; + std::unique_ptr<SequentialFile> seq_file; + std::unique_ptr<RandomAccessFile> rand_file; + Slice result; + char scratch[100]; + + ASSERT_OK(env_->NewWritableFile(test_dir_ + "/f", &writable_file, soptions_)); + ASSERT_OK(writable_file->Append("hello ")); + ASSERT_OK(writable_file->Append("world")); + ASSERT_OK(writable_file->Close()); + writable_file.reset(); + + // Read sequentially. 
+ ASSERT_OK(env_->NewSequentialFile(test_dir_ + "/f", &seq_file, soptions_)); + ASSERT_OK(seq_file->Read(5, &result, scratch)); // Read "hello". + ASSERT_EQ(0, result.compare("hello")); + ASSERT_OK(seq_file->Skip(1)); + ASSERT_OK(seq_file->Read(1000, &result, scratch)); // Read "world". + ASSERT_EQ(0, result.compare("world")); + ASSERT_OK(seq_file->Read(1000, &result, scratch)); // Try reading past EOF. + ASSERT_EQ(0U, result.size()); + ASSERT_OK(seq_file->Skip(100)); // Try to skip past end of file. + ASSERT_OK(seq_file->Read(1000, &result, scratch)); + ASSERT_EQ(0U, result.size()); + + // Random reads. + ASSERT_OK(env_->NewRandomAccessFile(test_dir_ + "/f", &rand_file, soptions_)); + ASSERT_OK(rand_file->Read(6, 5, &result, scratch)); // Read "world". + ASSERT_EQ(0, result.compare("world")); + ASSERT_OK(rand_file->Read(0, 5, &result, scratch)); // Read "hello". + ASSERT_EQ(0, result.compare("hello")); + ASSERT_OK(rand_file->Read(10, 100, &result, scratch)); // Read "d". + ASSERT_EQ(0, result.compare("d")); + + // Too high offset. + ASSERT_TRUE(rand_file->Read(1000, 5, &result, scratch).ok()); +} + +TEST_P(EnvBasicTestWithParam, Misc) { + std::unique_ptr<WritableFile> writable_file; + ASSERT_OK(env_->NewWritableFile(test_dir_ + "/b", &writable_file, soptions_)); + + // These are no-ops, but we test they return success. 
+ ASSERT_OK(writable_file->Sync()); + ASSERT_OK(writable_file->Flush()); + ASSERT_OK(writable_file->Close()); + writable_file.reset(); +} + +TEST_P(EnvBasicTestWithParam, LargeWrite) { + const size_t kWriteSize = 300 * 1024; + char* scratch = new char[kWriteSize * 2]; + + std::string write_data; + for (size_t i = 0; i < kWriteSize; ++i) { + write_data.append(1, static_cast<char>(i)); + } + + std::unique_ptr<WritableFile> writable_file; + ASSERT_OK(env_->NewWritableFile(test_dir_ + "/f", &writable_file, soptions_)); + ASSERT_OK(writable_file->Append("foo")); + ASSERT_OK(writable_file->Append(write_data)); + ASSERT_OK(writable_file->Close()); + writable_file.reset(); + + std::unique_ptr<SequentialFile> seq_file; + Slice result; + ASSERT_OK(env_->NewSequentialFile(test_dir_ + "/f", &seq_file, soptions_)); + ASSERT_OK(seq_file->Read(3, &result, scratch)); // Read "foo". + ASSERT_EQ(0, result.compare("foo")); + + size_t read = 0; + std::string read_data; + while (read < kWriteSize) { + ASSERT_OK(seq_file->Read(kWriteSize - read, &result, scratch)); + read_data.append(result.data(), result.size()); + read += result.size(); + } + ASSERT_TRUE(write_data == read_data); + delete[] scratch; +} + +TEST_P(EnvMoreTestWithParam, GetModTime) { + ASSERT_OK(env_->CreateDirIfMissing(test_dir_ + "/dir1")); + uint64_t mtime1 = 0x0; + ASSERT_OK(env_->GetFileModificationTime(test_dir_ + "/dir1", &mtime1)); +} + +TEST_P(EnvMoreTestWithParam, MakeDir) { + ASSERT_OK(env_->CreateDir(test_dir_ + "/j")); + ASSERT_OK(env_->FileExists(test_dir_ + "/j")); + std::vector<std::string> children; + ASSERT_OK(env_->GetChildren(test_dir_, &children)); + ASSERT_EQ(1U, children.size()); + // fail because file already exists + ASSERT_TRUE(!env_->CreateDir(test_dir_ + "/j").ok()); + ASSERT_OK(env_->CreateDirIfMissing(test_dir_ + "/j")); + ASSERT_OK(env_->DeleteDir(test_dir_ + "/j")); + ASSERT_EQ(Status::NotFound(), env_->FileExists(test_dir_ + "/j")); +} + +TEST_P(EnvMoreTestWithParam, GetChildren) { + // 
empty folder returns empty vector + std::vector<std::string> children; + std::vector<Env::FileAttributes> childAttr; + ASSERT_OK(env_->CreateDirIfMissing(test_dir_)); + ASSERT_OK(env_->GetChildren(test_dir_, &children)); + ASSERT_OK(env_->FileExists(test_dir_)); + ASSERT_OK(env_->GetChildrenFileAttributes(test_dir_, &childAttr)); + ASSERT_EQ(0U, children.size()); + ASSERT_EQ(0U, childAttr.size()); + + // folder with contents returns relative path to test dir + ASSERT_OK(env_->CreateDirIfMissing(test_dir_ + "/niu")); + ASSERT_OK(env_->CreateDirIfMissing(test_dir_ + "/you")); + ASSERT_OK(env_->CreateDirIfMissing(test_dir_ + "/guo")); + ASSERT_OK(env_->GetChildren(test_dir_, &children)); + ASSERT_OK(env_->GetChildrenFileAttributes(test_dir_, &childAttr)); + ASSERT_EQ(3U, children.size()); + ASSERT_EQ(3U, childAttr.size()); + for (auto each : children) { + env_->DeleteDir(test_dir_ + "/" + each).PermitUncheckedError(); + } // necessary for default POSIX env + + // non-exist directory returns IOError + ASSERT_OK(env_->DeleteDir(test_dir_)); + ASSERT_NOK(env_->FileExists(test_dir_)); + ASSERT_NOK(env_->GetChildren(test_dir_, &children)); + ASSERT_NOK(env_->GetChildrenFileAttributes(test_dir_, &childAttr)); + + // if dir is a file, returns IOError + ASSERT_OK(env_->CreateDir(test_dir_)); + std::unique_ptr<WritableFile> writable_file; + ASSERT_OK( + env_->NewWritableFile(test_dir_ + "/file", &writable_file, soptions_)); + ASSERT_OK(writable_file->Close()); + writable_file.reset(); + ASSERT_NOK(env_->GetChildren(test_dir_ + "/file", &children)); + ASSERT_EQ(0U, children.size()); +} + +TEST_P(EnvMoreTestWithParam, GetChildrenIgnoresDotAndDotDot) { + auto* env = Env::Default(); + ASSERT_OK(env->CreateDirIfMissing(test_dir_)); + + // Create a single file + std::string path = test_dir_; + const EnvOptions soptions; +#ifdef OS_WIN + path.append("\\test_file"); +#else + path.append("/test_file"); +#endif + std::string data("test data"); + std::unique_ptr<WritableFile> file; + 
ASSERT_OK(env->NewWritableFile(path, &file, soptions)); + ASSERT_OK(file->Append("test data")); + + // get the children + std::vector<std::string> result; + ASSERT_OK(env->GetChildren(test_dir_, &result)); + + // expect only one file named `test_data`, i.e. no `.` or `..` names + ASSERT_EQ(result.size(), 1); + ASSERT_EQ(result.at(0), "test_file"); +} + +} // namespace ROCKSDB_NAMESPACE +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/env/env_chroot.cc b/src/rocksdb/env/env_chroot.cc new file mode 100644 index 000000000..a64373517 --- /dev/null +++ b/src/rocksdb/env/env_chroot.cc @@ -0,0 +1,148 @@ +// Copyright (c) 2016-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#if !defined(ROCKSDB_LITE) && !defined(OS_WIN) + +#include "env/env_chroot.h" + +#include <errno.h> // errno +#include <stdlib.h> // realpath, free +#include <unistd.h> // geteuid + +#include "env/composite_env_wrapper.h" +#include "env/fs_remap.h" +#include "rocksdb/utilities/options_type.h" +#include "util/string_util.h" // errnoStr + +namespace ROCKSDB_NAMESPACE { +namespace { +static std::unordered_map<std::string, OptionTypeInfo> chroot_fs_type_info = { + {"chroot_dir", {0, OptionType::kString}}}; +} // namespace +ChrootFileSystem::ChrootFileSystem(const std::shared_ptr<FileSystem>& base, + const std::string& chroot_dir) + : RemapFileSystem(base), chroot_dir_(chroot_dir) { + RegisterOptions("chroot_dir", &chroot_dir_, &chroot_fs_type_info); +} + +Status ChrootFileSystem::PrepareOptions(const ConfigOptions& options) { + Status s = FileSystemWrapper::PrepareOptions(options); + if (!s.ok()) { + return s; + } else if (chroot_dir_.empty()) { + s = Status::InvalidArgument("ChRootFileSystem requires a chroot dir"); + } else { + s = target_->FileExists(chroot_dir_, IOOptions(), nullptr); + } + if (s.ok()) { +#if defined(OS_AIX) + char resolvedName[PATH_MAX]; + char* real_chroot_dir = realpath(chroot_dir_.c_str(), resolvedName); +#else + char* real_chroot_dir = realpath(chroot_dir_.c_str(), nullptr); +#endif + // chroot_dir must exist so realpath() returns non-nullptr. + assert(real_chroot_dir != nullptr); + chroot_dir_ = real_chroot_dir; +#if !defined(OS_AIX) + free(real_chroot_dir); +#endif + } + return s; +} + +IOStatus ChrootFileSystem::GetTestDirectory(const IOOptions& options, + std::string* path, + IODebugContext* dbg) { + // Adapted from PosixEnv's implementation since it doesn't provide a way to + // create directory in the chroot. 
+ char buf[256]; + snprintf(buf, sizeof(buf), "/rocksdbtest-%d", static_cast<int>(geteuid())); + *path = buf; + + // Directory may already exist, so ignore return + return CreateDirIfMissing(*path, options, dbg); +} + +// Returns status and expanded absolute path including the chroot directory. +// Checks whether the provided path breaks out of the chroot. If it returns +// non-OK status, the returned path should not be used. +std::pair<IOStatus, std::string> ChrootFileSystem::EncodePath( + const std::string& path) { + if (path.empty() || path[0] != '/') { + return {IOStatus::InvalidArgument(path, "Not an absolute path"), ""}; + } + std::pair<IOStatus, std::string> res; + res.second = chroot_dir_ + path; +#if defined(OS_AIX) + char resolvedName[PATH_MAX]; + char* normalized_path = realpath(res.second.c_str(), resolvedName); +#else + char* normalized_path = realpath(res.second.c_str(), nullptr); +#endif + if (normalized_path == nullptr) { + res.first = IOStatus::NotFound(res.second, errnoStr(errno).c_str()); + } else if (strlen(normalized_path) < chroot_dir_.size() || + strncmp(normalized_path, chroot_dir_.c_str(), + chroot_dir_.size()) != 0) { + res.first = IOStatus::IOError(res.second, + "Attempted to access path outside chroot"); + } else { + res.first = IOStatus::OK(); + } +#if !defined(OS_AIX) + free(normalized_path); +#endif + return res; +} + +// Similar to EncodePath() except assumes the basename in the path hasn't been +// created yet. 
+std::pair<IOStatus, std::string> ChrootFileSystem::EncodePathWithNewBasename( + const std::string& path) { + if (path.empty() || path[0] != '/') { + return {IOStatus::InvalidArgument(path, "Not an absolute path"), ""}; + } + // Basename may be followed by trailing slashes + size_t final_idx = path.find_last_not_of('/'); + if (final_idx == std::string::npos) { + // It's only slashes so no basename to extract + return EncodePath(path); + } + + // Pull off the basename temporarily since realname(3) (used by + // EncodePath()) requires a path that exists + size_t base_sep = path.rfind('/', final_idx); + auto status_and_enc_path = EncodePath(path.substr(0, base_sep + 1)); + status_and_enc_path.second.append(path.substr(base_sep + 1)); + return status_and_enc_path; +} + +std::shared_ptr<FileSystem> NewChrootFileSystem( + const std::shared_ptr<FileSystem>& base, const std::string& chroot_dir) { + auto chroot_fs = std::make_shared<ChrootFileSystem>(base, chroot_dir); + Status s = chroot_fs->PrepareOptions(ConfigOptions()); + if (s.ok()) { + return chroot_fs; + } else { + return nullptr; + } +} + +Env* NewChrootEnv(Env* base_env, const std::string& chroot_dir) { + if (!base_env->FileExists(chroot_dir).ok()) { + return nullptr; + } + auto chroot_fs = NewChrootFileSystem(base_env->GetFileSystem(), chroot_dir); + if (chroot_fs != nullptr) { + return new CompositeEnvWrapper(base_env, chroot_fs); + } else { + return nullptr; + } +} + +} // namespace ROCKSDB_NAMESPACE + +#endif // !defined(ROCKSDB_LITE) && !defined(OS_WIN) diff --git a/src/rocksdb/env/env_chroot.h b/src/rocksdb/env/env_chroot.h new file mode 100644 index 000000000..9e5b9a1e9 --- /dev/null +++ b/src/rocksdb/env/env_chroot.h @@ -0,0 +1,55 @@ +// Copyright (c) 2016-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + +#if !defined(ROCKSDB_LITE) && !defined(OS_WIN) + +#include <string> + +#include "env/fs_remap.h" +#include "rocksdb/file_system.h" + +namespace ROCKSDB_NAMESPACE { +class ChrootFileSystem : public RemapFileSystem { + public: + ChrootFileSystem(const std::shared_ptr<FileSystem>& base, + const std::string& chroot_dir); + + static const char* kClassName() { return "ChrootFS"; } + const char* Name() const override { return kClassName(); } + + IOStatus GetTestDirectory(const IOOptions& options, std::string* path, + IODebugContext* dbg) override; + + Status PrepareOptions(const ConfigOptions& options) override; + + protected: + // Returns status and expanded absolute path including the chroot directory. + // Checks whether the provided path breaks out of the chroot. If it returns + // non-OK status, the returned path should not be used. + std::pair<IOStatus, std::string> EncodePath(const std::string& path) override; + + // Similar to EncodePath() except assumes the basename in the path hasn't been + // created yet. + std::pair<IOStatus, std::string> EncodePathWithNewBasename( + const std::string& path) override; + + private: + std::string chroot_dir_; +}; + +// Returns an Env that translates paths such that the root directory appears to +// be chroot_dir. chroot_dir should refer to an existing directory. +// +// This class has not been fully analyzed for providing strong security +// guarantees. +Env* NewChrootEnv(Env* base_env, const std::string& chroot_dir); +std::shared_ptr<FileSystem> NewChrootFileSystem( + const std::shared_ptr<FileSystem>& base, const std::string& chroot_dir); + +} // namespace ROCKSDB_NAMESPACE + +#endif // !defined(ROCKSDB_LITE) && !defined(OS_WIN) diff --git a/src/rocksdb/env/env_encryption.cc b/src/rocksdb/env/env_encryption.cc new file mode 100644 index 000000000..c6b0a257d --- /dev/null +++ b/src/rocksdb/env/env_encryption.cc @@ -0,0 +1,1351 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE + +#include "rocksdb/env_encryption.h" + +#include <algorithm> +#include <cassert> +#include <cctype> +#include <iostream> + +#include "env/composite_env_wrapper.h" +#include "env/env_encryption_ctr.h" +#include "monitoring/perf_context_imp.h" +#include "rocksdb/convenience.h" +#include "rocksdb/io_status.h" +#include "rocksdb/system_clock.h" +#include "rocksdb/utilities/customizable_util.h" +#include "rocksdb/utilities/options_type.h" +#include "util/aligned_buffer.h" +#include "util/coding.h" +#include "util/random.h" +#include "util/string_util.h" + +#endif +namespace ROCKSDB_NAMESPACE { +#ifndef ROCKSDB_LITE +std::shared_ptr<EncryptionProvider> EncryptionProvider::NewCTRProvider( + const std::shared_ptr<BlockCipher>& cipher) { + return std::make_shared<CTREncryptionProvider>(cipher); +} + +// Read up to "n" bytes from the file. "scratch[0..n-1]" may be +// written by this routine. Sets "*result" to the data that was +// read (including if fewer than "n" bytes were successfully read). +// May set "*result" to point at data in "scratch[0..n-1]", so +// "scratch[0..n-1]" must be live when "*result" is used. +// If an error was encountered, returns a non-OK status. +// +// REQUIRES: External synchronization +IOStatus EncryptedSequentialFile::Read(size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) { + assert(scratch); + IOStatus io_s = file_->Read(n, options, result, scratch, dbg); + if (!io_s.ok()) { + return io_s; + } + { + PERF_TIMER_GUARD(decrypt_data_nanos); + io_s = status_to_io_status( + stream_->Decrypt(offset_, (char*)result->data(), result->size())); + } + if (io_s.ok()) { + offset_ += result->size(); // We've already ready data from disk, so update + // offset_ even if decryption fails. 
+ } + return io_s; +} + +// Skip "n" bytes from the file. This is guaranteed to be no +// slower that reading the same data, but may be faster. +// +// If end of file is reached, skipping will stop at the end of the +// file, and Skip will return OK. +// +// REQUIRES: External synchronization +IOStatus EncryptedSequentialFile::Skip(uint64_t n) { + auto status = file_->Skip(n); + if (!status.ok()) { + return status; + } + offset_ += n; + return status; +} + +// Indicates the upper layers if the current SequentialFile implementation +// uses direct IO. +bool EncryptedSequentialFile::use_direct_io() const { + return file_->use_direct_io(); +} + +// Use the returned alignment value to allocate +// aligned buffer for Direct I/O +size_t EncryptedSequentialFile::GetRequiredBufferAlignment() const { + return file_->GetRequiredBufferAlignment(); +} + +// Remove any kind of caching of data from the offset to offset+length +// of this file. If the length is 0, then it refers to the end of file. +// If the system is not caching the file contents, then this is a noop. +IOStatus EncryptedSequentialFile::InvalidateCache(size_t offset, + size_t length) { + return file_->InvalidateCache(offset + prefixLength_, length); +} + +// Positioned Read for direct I/O +// If Direct I/O enabled, offset, n, and scratch should be properly aligned +IOStatus EncryptedSequentialFile::PositionedRead(uint64_t offset, size_t n, + const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) { + assert(scratch); + offset += prefixLength_; // Skip prefix + auto io_s = file_->PositionedRead(offset, n, options, result, scratch, dbg); + if (!io_s.ok()) { + return io_s; + } + offset_ = offset + result->size(); + { + PERF_TIMER_GUARD(decrypt_data_nanos); + io_s = status_to_io_status( + stream_->Decrypt(offset, (char*)result->data(), result->size())); + } + return io_s; +} + +// Read up to "n" bytes from the file starting at "offset". +// "scratch[0..n-1]" may be written by this routine. 
Sets "*result" +// to the data that was read (including if fewer than "n" bytes were +// successfully read). May set "*result" to point at data in +// "scratch[0..n-1]", so "scratch[0..n-1]" must be live when +// "*result" is used. If an error was encountered, returns a non-OK +// status. +// +// Safe for concurrent use by multiple threads. +// If Direct I/O enabled, offset, n, and scratch should be aligned properly. +IOStatus EncryptedRandomAccessFile::Read(uint64_t offset, size_t n, + const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const { + assert(scratch); + offset += prefixLength_; + auto io_s = file_->Read(offset, n, options, result, scratch, dbg); + if (!io_s.ok()) { + return io_s; + } + { + PERF_TIMER_GUARD(decrypt_data_nanos); + io_s = status_to_io_status( + stream_->Decrypt(offset, (char*)result->data(), result->size())); + } + return io_s; +} + +// Readahead the file starting from offset by n bytes for caching. +IOStatus EncryptedRandomAccessFile::Prefetch(uint64_t offset, size_t n, + const IOOptions& options, + IODebugContext* dbg) { + // return Status::OK(); + return file_->Prefetch(offset + prefixLength_, n, options, dbg); +} + +// Tries to get an unique ID for this file that will be the same each time +// the file is opened (and will stay the same while the file is open). +// Furthermore, it tries to make this ID at most "max_size" bytes. If such an +// ID can be created this function returns the length of the ID and places it +// in "id"; otherwise, this function returns 0, in which case "id" +// may not have been modified. +// +// This function guarantees, for IDs from a given environment, two unique ids +// cannot be made equal to each other by adding arbitrary bytes to one of +// them. That is, no unique ID is the prefix of another. +// +// This function guarantees that the returned ID will not be interpretable as +// a single varint. +// +// Note: these IDs are only valid for the duration of the process. 
+size_t EncryptedRandomAccessFile::GetUniqueId(char* id, size_t max_size) const { + return file_->GetUniqueId(id, max_size); +}; + +void EncryptedRandomAccessFile::Hint(AccessPattern pattern) { + file_->Hint(pattern); +} + +// Indicates the upper layers if the current RandomAccessFile implementation +// uses direct IO. +bool EncryptedRandomAccessFile::use_direct_io() const { + return file_->use_direct_io(); +} + +// Use the returned alignment value to allocate +// aligned buffer for Direct I/O +size_t EncryptedRandomAccessFile::GetRequiredBufferAlignment() const { + return file_->GetRequiredBufferAlignment(); +} + +// Remove any kind of caching of data from the offset to offset+length +// of this file. If the length is 0, then it refers to the end of file. +// If the system is not caching the file contents, then this is a noop. +IOStatus EncryptedRandomAccessFile::InvalidateCache(size_t offset, + size_t length) { + return file_->InvalidateCache(offset + prefixLength_, length); +} + +// A file abstraction for sequential writing. The implementation +// must provide buffering since callers may append small fragments +// at a time to the file. +IOStatus EncryptedWritableFile::Append(const Slice& data, + const IOOptions& options, + IODebugContext* dbg) { + AlignedBuffer buf; + Slice dataToAppend(data); + if (data.size() > 0) { + auto offset = file_->GetFileSize(options, dbg); // size including prefix + // Encrypt in cloned buffer + buf.Alignment(GetRequiredBufferAlignment()); + buf.AllocateNewBuffer(data.size()); + // TODO (sagar0): Modify AlignedBuffer.Append to allow doing a memmove + // so that the next two lines can be replaced with buf.Append(). 
+ memmove(buf.BufferStart(), data.data(), data.size()); + buf.Size(data.size()); + IOStatus io_s; + { + PERF_TIMER_GUARD(encrypt_data_nanos); + io_s = status_to_io_status( + stream_->Encrypt(offset, buf.BufferStart(), buf.CurrentSize())); + } + if (!io_s.ok()) { + return io_s; + } + dataToAppend = Slice(buf.BufferStart(), buf.CurrentSize()); + } + return file_->Append(dataToAppend, options, dbg); +} + +IOStatus EncryptedWritableFile::PositionedAppend(const Slice& data, + uint64_t offset, + const IOOptions& options, + IODebugContext* dbg) { + AlignedBuffer buf; + Slice dataToAppend(data); + offset += prefixLength_; + if (data.size() > 0) { + // Encrypt in cloned buffer + buf.Alignment(GetRequiredBufferAlignment()); + buf.AllocateNewBuffer(data.size()); + memmove(buf.BufferStart(), data.data(), data.size()); + buf.Size(data.size()); + IOStatus io_s; + { + PERF_TIMER_GUARD(encrypt_data_nanos); + io_s = status_to_io_status( + stream_->Encrypt(offset, buf.BufferStart(), buf.CurrentSize())); + } + if (!io_s.ok()) { + return io_s; + } + dataToAppend = Slice(buf.BufferStart(), buf.CurrentSize()); + } + return file_->PositionedAppend(dataToAppend, offset, options, dbg); +} + +// Indicates the upper layers if the current WritableFile implementation +// uses direct IO. +bool EncryptedWritableFile::use_direct_io() const { + return file_->use_direct_io(); +} + +// true if Sync() and Fsync() are safe to call concurrently with Append() +// and Flush(). +bool EncryptedWritableFile::IsSyncThreadSafe() const { + return file_->IsSyncThreadSafe(); +} + +// Use the returned alignment value to allocate +// aligned buffer for Direct I/O +size_t EncryptedWritableFile::GetRequiredBufferAlignment() const { + return file_->GetRequiredBufferAlignment(); +} + +/* + * Get the size of valid data in the file. 
+ */ +uint64_t EncryptedWritableFile::GetFileSize(const IOOptions& options, + IODebugContext* dbg) { + return file_->GetFileSize(options, dbg) - prefixLength_; +} + +// Truncate is necessary to trim the file to the correct size +// before closing. It is not always possible to keep track of the file +// size due to whole pages writes. The behavior is undefined if called +// with other writes to follow. +IOStatus EncryptedWritableFile::Truncate(uint64_t size, + const IOOptions& options, + IODebugContext* dbg) { + return file_->Truncate(size + prefixLength_, options, dbg); +} + +// Remove any kind of caching of data from the offset to offset+length +// of this file. If the length is 0, then it refers to the end of file. +// If the system is not caching the file contents, then this is a noop. +// This call has no effect on dirty pages in the cache. +IOStatus EncryptedWritableFile::InvalidateCache(size_t offset, size_t length) { + return file_->InvalidateCache(offset + prefixLength_, length); +} + +// Sync a file range with disk. +// offset is the starting byte of the file range to be synchronized. +// nbytes specifies the length of the range to be synchronized. +// This asks the OS to initiate flushing the cached data to disk, +// without waiting for completion. +// Default implementation does nothing. +IOStatus EncryptedWritableFile::RangeSync(uint64_t offset, uint64_t nbytes, + const IOOptions& options, + IODebugContext* dbg) { + return file_->RangeSync(offset + prefixLength_, nbytes, options, dbg); +} + +// PrepareWrite performs any necessary preparation for a write +// before the write actually occurs. This allows for pre-allocation +// of space on devices where it can result in less file +// fragmentation and/or less waste from over-zealous filesystem +// pre-allocation. 
+void EncryptedWritableFile::PrepareWrite(size_t offset, size_t len, + const IOOptions& options, + IODebugContext* dbg) { + file_->PrepareWrite(offset + prefixLength_, len, options, dbg); +} + +void EncryptedWritableFile::SetPreallocationBlockSize(size_t size) { + // the size here doesn't need to include prefixLength_, as it's a + // configuration will be use for `PrepareWrite()`. + file_->SetPreallocationBlockSize(size); +} + +void EncryptedWritableFile::GetPreallocationStatus( + size_t* block_size, size_t* last_allocated_block) { + file_->GetPreallocationStatus(block_size, last_allocated_block); +} + +// Pre-allocates space for a file. +IOStatus EncryptedWritableFile::Allocate(uint64_t offset, uint64_t len, + const IOOptions& options, + IODebugContext* dbg) { + return file_->Allocate(offset + prefixLength_, len, options, dbg); +} + +IOStatus EncryptedWritableFile::Flush(const IOOptions& options, + IODebugContext* dbg) { + return file_->Flush(options, dbg); +} + +IOStatus EncryptedWritableFile::Sync(const IOOptions& options, + IODebugContext* dbg) { + return file_->Sync(options, dbg); +} + +IOStatus EncryptedWritableFile::Close(const IOOptions& options, + IODebugContext* dbg) { + return file_->Close(options, dbg); +} + +// A file abstraction for random reading and writing. + +// Indicates if the class makes use of direct I/O +// If false you must pass aligned buffer to Write() +bool EncryptedRandomRWFile::use_direct_io() const { + return file_->use_direct_io(); +} + +// Use the returned alignment value to allocate +// aligned buffer for Direct I/O +size_t EncryptedRandomRWFile::GetRequiredBufferAlignment() const { + return file_->GetRequiredBufferAlignment(); +} + +// Write bytes in `data` at offset `offset`, Returns Status::OK() on success. +// Pass aligned buffer when use_direct_io() returns true. 
+IOStatus EncryptedRandomRWFile::Write(uint64_t offset, const Slice& data, + const IOOptions& options, + IODebugContext* dbg) { + AlignedBuffer buf; + Slice dataToWrite(data); + offset += prefixLength_; + if (data.size() > 0) { + // Encrypt in cloned buffer + buf.Alignment(GetRequiredBufferAlignment()); + buf.AllocateNewBuffer(data.size()); + memmove(buf.BufferStart(), data.data(), data.size()); + buf.Size(data.size()); + IOStatus io_s; + { + PERF_TIMER_GUARD(encrypt_data_nanos); + io_s = status_to_io_status( + stream_->Encrypt(offset, buf.BufferStart(), buf.CurrentSize())); + } + if (!io_s.ok()) { + return io_s; + } + dataToWrite = Slice(buf.BufferStart(), buf.CurrentSize()); + } + return file_->Write(offset, dataToWrite, options, dbg); +} + +// Read up to `n` bytes starting from offset `offset` and store them in +// result, provided `scratch` size should be at least `n`. +// Returns Status::OK() on success. +IOStatus EncryptedRandomRWFile::Read(uint64_t offset, size_t n, + const IOOptions& options, Slice* result, + char* scratch, IODebugContext* dbg) const { + assert(scratch); + offset += prefixLength_; + auto status = file_->Read(offset, n, options, result, scratch, dbg); + if (!status.ok()) { + return status; + } + { + PERF_TIMER_GUARD(decrypt_data_nanos); + status = status_to_io_status( + stream_->Decrypt(offset, (char*)result->data(), result->size())); + } + return status; +} + +IOStatus EncryptedRandomRWFile::Flush(const IOOptions& options, + IODebugContext* dbg) { + return file_->Flush(options, dbg); +} + +IOStatus EncryptedRandomRWFile::Sync(const IOOptions& options, + IODebugContext* dbg) { + return file_->Sync(options, dbg); +} + +IOStatus EncryptedRandomRWFile::Fsync(const IOOptions& options, + IODebugContext* dbg) { + return file_->Fsync(options, dbg); +} + +IOStatus EncryptedRandomRWFile::Close(const IOOptions& options, + IODebugContext* dbg) { + return file_->Close(options, dbg); +} + +namespace { +static std::unordered_map<std::string, 
OptionTypeInfo> encrypted_fs_type_info = + { + {"provider", + OptionTypeInfo::AsCustomSharedPtr<EncryptionProvider>( + 0 /* No offset, whole struct*/, OptionVerificationType::kByName, + OptionTypeFlags::kNone)}, +}; +// EncryptedFileSystemImpl implements an FileSystemWrapper that adds encryption +// to files stored on disk. +class EncryptedFileSystemImpl : public EncryptedFileSystem { + public: + const char* Name() const override { + return EncryptedFileSystem::kClassName(); + } + // Returns the raw encryption provider that should be used to write the input + // encrypted file. If there is no such provider, NotFound is returned. + IOStatus GetWritableProvider(const std::string& /*fname*/, + EncryptionProvider** result) { + if (provider_) { + *result = provider_.get(); + return IOStatus::OK(); + } else { + *result = nullptr; + return IOStatus::NotFound("No WriteProvider specified"); + } + } + + // Returns the raw encryption provider that should be used to read the input + // encrypted file. If there is no such provider, NotFound is returned. + IOStatus GetReadableProvider(const std::string& /*fname*/, + EncryptionProvider** result) { + if (provider_) { + *result = provider_.get(); + return IOStatus::OK(); + } else { + *result = nullptr; + return IOStatus::NotFound("No Provider specified"); + } + } + + // Creates a CipherStream for the underlying file/name using the options + // If a writable provider is found and encryption is enabled, uses + // this provider to create a cipher stream. + // @param fname Name of the writable file + // @param underlying The underlying "raw" file + // @param options Options for creating the file/cipher + // @param prefix_length Returns the length of the encryption prefix used for + // this file + // @param stream Returns the cipher stream to use for this file if it + // should be encrypted + // @return OK on success, non-OK on failure. 
+ template <class TypeFile> + IOStatus CreateWritableCipherStream( + const std::string& fname, const std::unique_ptr<TypeFile>& underlying, + const FileOptions& options, size_t* prefix_length, + std::unique_ptr<BlockAccessCipherStream>* stream, IODebugContext* dbg) { + EncryptionProvider* provider = nullptr; + *prefix_length = 0; + IOStatus status = GetWritableProvider(fname, &provider); + if (!status.ok()) { + return status; + } else if (provider != nullptr) { + // Initialize & write prefix (if needed) + AlignedBuffer buffer; + Slice prefix; + *prefix_length = provider->GetPrefixLength(); + if (*prefix_length > 0) { + // Initialize prefix + buffer.Alignment(underlying->GetRequiredBufferAlignment()); + buffer.AllocateNewBuffer(*prefix_length); + status = status_to_io_status(provider->CreateNewPrefix( + fname, buffer.BufferStart(), *prefix_length)); + if (status.ok()) { + buffer.Size(*prefix_length); + prefix = Slice(buffer.BufferStart(), buffer.CurrentSize()); + // Write prefix + status = underlying->Append(prefix, options.io_options, dbg); + } + if (!status.ok()) { + return status; + } + } + // Create cipher stream + status = status_to_io_status( + provider->CreateCipherStream(fname, options, prefix, stream)); + } + return status; + } + + template <class TypeFile> + IOStatus CreateWritableEncryptedFile(const std::string& fname, + std::unique_ptr<TypeFile>& underlying, + const FileOptions& options, + std::unique_ptr<TypeFile>* result, + IODebugContext* dbg) { + // Create cipher stream + std::unique_ptr<BlockAccessCipherStream> stream; + size_t prefix_length; + IOStatus status = CreateWritableCipherStream(fname, underlying, options, + &prefix_length, &stream, dbg); + if (status.ok()) { + if (stream) { + result->reset(new EncryptedWritableFile( + std::move(underlying), std::move(stream), prefix_length)); + } else { + result->reset(underlying.release()); + } + } + return status; + } + + // Creates a CipherStream for the underlying file/name using the options + // If a 
writable provider is found and encryption is enabled, uses + // this provider to create a cipher stream. + // @param fname Name of the writable file + // @param underlying The underlying "raw" file + // @param options Options for creating the file/cipher + // @param prefix_length Returns the length of the encryption prefix used for + // this file + // @param stream Returns the cipher stream to use for this file if it + // should be encrypted + // @return OK on success, non-OK on failure. + template <class TypeFile> + IOStatus CreateRandomWriteCipherStream( + const std::string& fname, const std::unique_ptr<TypeFile>& underlying, + const FileOptions& options, size_t* prefix_length, + std::unique_ptr<BlockAccessCipherStream>* stream, IODebugContext* dbg) { + EncryptionProvider* provider = nullptr; + *prefix_length = 0; + IOStatus io_s = GetWritableProvider(fname, &provider); + if (!io_s.ok()) { + return io_s; + } else if (provider != nullptr) { + // Initialize & write prefix (if needed) + AlignedBuffer buffer; + Slice prefix; + *prefix_length = provider->GetPrefixLength(); + if (*prefix_length > 0) { + // Initialize prefix + buffer.Alignment(underlying->GetRequiredBufferAlignment()); + buffer.AllocateNewBuffer(*prefix_length); + io_s = status_to_io_status(provider->CreateNewPrefix( + fname, buffer.BufferStart(), *prefix_length)); + if (io_s.ok()) { + buffer.Size(*prefix_length); + prefix = Slice(buffer.BufferStart(), buffer.CurrentSize()); + // Write prefix + io_s = underlying->Write(0, prefix, options.io_options, dbg); + } + if (!io_s.ok()) { + return io_s; + } + } + // Create cipher stream + io_s = status_to_io_status( + provider->CreateCipherStream(fname, options, prefix, stream)); + } + return io_s; + } + + // Creates a CipherStream for the underlying file/name using the options + // If a readable provider is found and the file is encrypted, uses + // this provider to create a cipher stream. 
+ // @param fname Name of the writable file + // @param underlying The underlying "raw" file + // @param options Options for creating the file/cipher + // @param prefix_length Returns the length of the encryption prefix used for + // this file + // @param stream Returns the cipher stream to use for this file if it + // is encrypted + // @return OK on success, non-OK on failure. + template <class TypeFile> + IOStatus CreateSequentialCipherStream( + const std::string& fname, const std::unique_ptr<TypeFile>& underlying, + const FileOptions& options, size_t* prefix_length, + std::unique_ptr<BlockAccessCipherStream>* stream, IODebugContext* dbg) { + // Read prefix (if needed) + AlignedBuffer buffer; + Slice prefix; + *prefix_length = provider_->GetPrefixLength(); + if (*prefix_length > 0) { + // Read prefix + buffer.Alignment(underlying->GetRequiredBufferAlignment()); + buffer.AllocateNewBuffer(*prefix_length); + IOStatus status = underlying->Read(*prefix_length, options.io_options, + &prefix, buffer.BufferStart(), dbg); + if (!status.ok()) { + return status; + } + buffer.Size(*prefix_length); + } + return status_to_io_status( + provider_->CreateCipherStream(fname, options, prefix, stream)); + } + + // Creates a CipherStream for the underlying file/name using the options + // If a readable provider is found and the file is encrypted, uses + // this provider to create a cipher stream. + // @param fname Name of the writable file + // @param underlying The underlying "raw" file + // @param options Options for creating the file/cipher + // @param prefix_length Returns the length of the encryption prefix used for + // this file + // @param stream Returns the cipher stream to use for this file if it + // is encrypted + // @return OK on success, non-OK on failure. 
+ template <class TypeFile> + IOStatus CreateRandomReadCipherStream( + const std::string& fname, const std::unique_ptr<TypeFile>& underlying, + const FileOptions& options, size_t* prefix_length, + std::unique_ptr<BlockAccessCipherStream>* stream, IODebugContext* dbg) { + // Read prefix (if needed) + AlignedBuffer buffer; + Slice prefix; + *prefix_length = provider_->GetPrefixLength(); + if (*prefix_length > 0) { + // Read prefix + buffer.Alignment(underlying->GetRequiredBufferAlignment()); + buffer.AllocateNewBuffer(*prefix_length); + IOStatus status = underlying->Read(0, *prefix_length, options.io_options, + &prefix, buffer.BufferStart(), dbg); + if (!status.ok()) { + return status; + } + buffer.Size(*prefix_length); + } + return status_to_io_status( + provider_->CreateCipherStream(fname, options, prefix, stream)); + } + + public: + EncryptedFileSystemImpl(const std::shared_ptr<FileSystem>& base, + const std::shared_ptr<EncryptionProvider>& provider) + : EncryptedFileSystem(base) { + provider_ = provider; + RegisterOptions("EncryptionProvider", &provider_, &encrypted_fs_type_info); + } + + Status AddCipher(const std::string& descriptor, const char* cipher, + size_t len, bool for_write) override { + return provider_->AddCipher(descriptor, cipher, len, for_write); + } + + // NewSequentialFile opens a file for sequential reading. 
+ IOStatus NewSequentialFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr<FSSequentialFile>* result, + IODebugContext* dbg) override { + result->reset(); + if (options.use_mmap_reads) { + return IOStatus::InvalidArgument(); + } + // Open file using underlying Env implementation + std::unique_ptr<FSSequentialFile> underlying; + auto status = + FileSystemWrapper::NewSequentialFile(fname, options, &underlying, dbg); + if (!status.ok()) { + return status; + } + uint64_t file_size; + status = FileSystemWrapper::GetFileSize(fname, options.io_options, + &file_size, dbg); + if (!status.ok()) { + return status; + } + if (!file_size) { + *result = std::move(underlying); + return status; + } + // Create cipher stream + std::unique_ptr<BlockAccessCipherStream> stream; + size_t prefix_length; + status = CreateSequentialCipherStream(fname, underlying, options, + &prefix_length, &stream, dbg); + if (status.ok()) { + result->reset(new EncryptedSequentialFile( + std::move(underlying), std::move(stream), prefix_length)); + } + return status; + } + + // NewRandomAccessFile opens a file for random read access. 
+ IOStatus NewRandomAccessFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr<FSRandomAccessFile>* result, + IODebugContext* dbg) override { + result->reset(); + if (options.use_mmap_reads) { + return IOStatus::InvalidArgument(); + } + // Open file using underlying Env implementation + std::unique_ptr<FSRandomAccessFile> underlying; + auto status = FileSystemWrapper::NewRandomAccessFile(fname, options, + &underlying, dbg); + if (!status.ok()) { + return status; + } + std::unique_ptr<BlockAccessCipherStream> stream; + size_t prefix_length; + status = CreateRandomReadCipherStream(fname, underlying, options, + &prefix_length, &stream, dbg); + if (status.ok()) { + if (stream) { + result->reset(new EncryptedRandomAccessFile( + std::move(underlying), std::move(stream), prefix_length)); + } else { + result->reset(underlying.release()); + } + } + return status; + } + + // NewWritableFile opens a file for sequential writing. + IOStatus NewWritableFile(const std::string& fname, const FileOptions& options, + std::unique_ptr<FSWritableFile>* result, + IODebugContext* dbg) override { + result->reset(); + if (options.use_mmap_writes) { + return IOStatus::InvalidArgument(); + } + // Open file using underlying Env implementation + std::unique_ptr<FSWritableFile> underlying; + IOStatus status = + FileSystemWrapper::NewWritableFile(fname, options, &underlying, dbg); + if (!status.ok()) { + return status; + } + return CreateWritableEncryptedFile(fname, underlying, options, result, dbg); + } + + // Create an object that writes to a new file with the specified + // name. Deletes any existing file with the same name and creates a + // new file. On success, stores a pointer to the new file in + // *result and returns OK. On failure stores nullptr in *result and + // returns non-OK. + // + // The returned file will only be accessed by one thread at a time. 
+ IOStatus ReopenWritableFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr<FSWritableFile>* result, + IODebugContext* dbg) override { + result->reset(); + if (options.use_mmap_writes) { + return IOStatus::InvalidArgument(); + } + // Open file using underlying Env implementation + std::unique_ptr<FSWritableFile> underlying; + IOStatus status = + FileSystemWrapper::ReopenWritableFile(fname, options, &underlying, dbg); + if (!status.ok()) { + return status; + } + return CreateWritableEncryptedFile(fname, underlying, options, result, dbg); + } + + // Reuse an existing file by renaming it and opening it as writable. + IOStatus ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + const FileOptions& options, + std::unique_ptr<FSWritableFile>* result, + IODebugContext* dbg) override { + result->reset(); + if (options.use_mmap_writes) { + return IOStatus::InvalidArgument(); + } + // Open file using underlying Env implementation + std::unique_ptr<FSWritableFile> underlying; + auto status = FileSystemWrapper::ReuseWritableFile( + fname, old_fname, options, &underlying, dbg); + if (!status.ok()) { + return status; + } + return CreateWritableEncryptedFile(fname, underlying, options, result, dbg); + } + + // Open `fname` for random read and write, if file doesn't exist the file + // will be created. On success, stores a pointer to the new file in + // *result and returns OK. On failure returns non-OK. + // + // The returned file will only be accessed by one thread at a time. 
+ IOStatus NewRandomRWFile(const std::string& fname, const FileOptions& options, + std::unique_ptr<FSRandomRWFile>* result, + IODebugContext* dbg) override { + result->reset(); + if (options.use_mmap_reads || options.use_mmap_writes) { + return IOStatus::InvalidArgument(); + } + // Check file exists + bool isNewFile = !FileExists(fname, options.io_options, dbg).ok(); + + // Open file using underlying Env implementation + std::unique_ptr<FSRandomRWFile> underlying; + auto status = + FileSystemWrapper::NewRandomRWFile(fname, options, &underlying, dbg); + if (!status.ok()) { + return status; + } + // Create cipher stream + std::unique_ptr<BlockAccessCipherStream> stream; + size_t prefix_length = 0; + if (!isNewFile) { + // File already exists, read prefix + status = CreateRandomReadCipherStream(fname, underlying, options, + &prefix_length, &stream, dbg); + } else { + status = CreateRandomWriteCipherStream(fname, underlying, options, + &prefix_length, &stream, dbg); + } + if (status.ok()) { + if (stream) { + result->reset(new EncryptedRandomRWFile( + std::move(underlying), std::move(stream), prefix_length)); + } else { + result->reset(underlying.release()); + } + } + return status; + } + + // Store in *result the attributes of the children of the specified + // directory. + // In case the implementation lists the directory prior to iterating the + // files + // and files are concurrently deleted, the deleted files will be omitted + // from + // result. + // The name attributes are relative to "dir". + // Original contents of *results are dropped. + // Returns OK if "dir" exists and "*result" contains its children. + // NotFound if "dir" does not exist, the calling process does not + // have + // permission to access "dir", or if "dir" is invalid. 
+ // IOError if an IO Error was encountered + IOStatus GetChildrenFileAttributes(const std::string& dir, + const IOOptions& options, + std::vector<FileAttributes>* result, + IODebugContext* dbg) override { + auto status = + FileSystemWrapper::GetChildrenFileAttributes(dir, options, result, dbg); + if (!status.ok()) { + return status; + } + for (auto it = std::begin(*result); it != std::end(*result); ++it) { + // assert(it->size_bytes >= prefixLength); + // breaks env_basic_test when called on directory containing + // directories + // which makes subtraction of prefixLength worrisome since + // FileAttributes does not identify directories + EncryptionProvider* provider; + status = GetReadableProvider(it->name, &provider); + if (!status.ok()) { + return status; + } else if (provider != nullptr) { + it->size_bytes -= provider->GetPrefixLength(); + } + } + return IOStatus::OK(); + } + + // Store the size of fname in *file_size. + IOStatus GetFileSize(const std::string& fname, const IOOptions& options, + uint64_t* file_size, IODebugContext* dbg) override { + auto status = + FileSystemWrapper::GetFileSize(fname, options, file_size, dbg); + if (!status.ok() || !(*file_size)) { + return status; + } + EncryptionProvider* provider; + status = GetReadableProvider(fname, &provider); + if (provider != nullptr && status.ok()) { + size_t prefixLength = provider->GetPrefixLength(); + assert(*file_size >= prefixLength); + *file_size -= prefixLength; + } + return status; + } + + private: + std::shared_ptr<EncryptionProvider> provider_; +}; +} // namespace + +Status NewEncryptedFileSystemImpl( + const std::shared_ptr<FileSystem>& base, + const std::shared_ptr<EncryptionProvider>& provider, + std::unique_ptr<FileSystem>* result) { + result->reset(new EncryptedFileSystemImpl(base, provider)); + return Status::OK(); +} + +std::shared_ptr<FileSystem> NewEncryptedFS( + const std::shared_ptr<FileSystem>& base, + const std::shared_ptr<EncryptionProvider>& provider) { + 
std::unique_ptr<FileSystem> efs; + Status s = NewEncryptedFileSystemImpl(base, provider, &efs); + if (s.ok()) { + s = efs->PrepareOptions(ConfigOptions()); + } + if (s.ok()) { + std::shared_ptr<FileSystem> result(efs.release()); + return result; + } else { + return nullptr; + } +} +// Returns an Env that encrypts data when stored on disk and decrypts data when +// read from disk. +Env* NewEncryptedEnv(Env* base_env, + const std::shared_ptr<EncryptionProvider>& provider) { + return new CompositeEnvWrapper( + base_env, NewEncryptedFS(base_env->GetFileSystem(), provider)); +} + +// Encrypt one or more (partial) blocks of data at the file offset. +// Length of data is given in dataSize. +Status BlockAccessCipherStream::Encrypt(uint64_t fileOffset, char* data, + size_t dataSize) { + // Calculate block index + auto blockSize = BlockSize(); + uint64_t blockIndex = fileOffset / blockSize; + size_t blockOffset = fileOffset % blockSize; + std::unique_ptr<char[]> blockBuffer; + + std::string scratch; + AllocateScratch(scratch); + + // Encrypt individual blocks. + while (1) { + char* block = data; + size_t n = std::min(dataSize, blockSize - blockOffset); + if (n != blockSize) { + // We're not encrypting a full block. + // Copy data to blockBuffer + if (!blockBuffer.get()) { + // Allocate buffer + blockBuffer = std::unique_ptr<char[]>(new char[blockSize]); + } + block = blockBuffer.get(); + // Copy plain data to block buffer + memmove(block + blockOffset, data, n); + } + auto status = EncryptBlock(blockIndex, block, (char*)scratch.data()); + if (!status.ok()) { + return status; + } + if (block != data) { + // Copy encrypted data back to `data`. + memmove(data, block + blockOffset, n); + } + dataSize -= n; + if (dataSize == 0) { + return Status::OK(); + } + data += n; + blockOffset = 0; + blockIndex++; + } +} + +// Decrypt one or more (partial) blocks of data at the file offset. +// Length of data is given in dataSize. 
+Status BlockAccessCipherStream::Decrypt(uint64_t fileOffset, char* data, + size_t dataSize) { + // Calculate block index + auto blockSize = BlockSize(); + uint64_t blockIndex = fileOffset / blockSize; + size_t blockOffset = fileOffset % blockSize; + std::unique_ptr<char[]> blockBuffer; + + std::string scratch; + AllocateScratch(scratch); + + // Decrypt individual blocks. + while (1) { + char* block = data; + size_t n = std::min(dataSize, blockSize - blockOffset); + if (n != blockSize) { + // We're not decrypting a full block. + // Copy data to blockBuffer + if (!blockBuffer.get()) { + // Allocate buffer + blockBuffer = std::unique_ptr<char[]>(new char[blockSize]); + } + block = blockBuffer.get(); + // Copy encrypted data to block buffer + memmove(block + blockOffset, data, n); + } + auto status = DecryptBlock(blockIndex, block, (char*)scratch.data()); + if (!status.ok()) { + return status; + } + if (block != data) { + // Copy decrypted data back to `data`. + memmove(data, block + blockOffset, n); + } + + // Simply decrementing dataSize by n could cause it to underflow, + // which will very likely make it read over the original bounds later + assert(dataSize >= n); + if (dataSize < n) { + return Status::Corruption("Cannot decrypt data at given offset"); + } + + dataSize -= n; + if (dataSize == 0) { + return Status::OK(); + } + data += n; + blockOffset = 0; + blockIndex++; + } +} + +namespace { +static std::unordered_map<std::string, OptionTypeInfo> + rot13_block_cipher_type_info = { + {"block_size", + {0 /* No offset, whole struct*/, OptionType::kInt, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, +}; +// Implements a BlockCipher using ROT13. +// +// Note: This is a sample implementation of BlockCipher, +// it is NOT considered safe and should NOT be used in production. 
+class ROT13BlockCipher : public BlockCipher { + private: + size_t blockSize_; + + public: + explicit ROT13BlockCipher(size_t blockSize) : blockSize_(blockSize) { + RegisterOptions("ROT13BlockCipherOptions", &blockSize_, + &rot13_block_cipher_type_info); + } + + static const char* kClassName() { return "ROT13"; } + const char* Name() const override { return kClassName(); } + // BlockSize returns the size of each block supported by this cipher stream. + size_t BlockSize() override { return blockSize_; } + + // Encrypt a block of data. + // Length of data is equal to BlockSize(). + Status Encrypt(char* data) override { + for (size_t i = 0; i < blockSize_; ++i) { + data[i] += 13; + } + return Status::OK(); + } + + // Decrypt a block of data. + // Length of data is equal to BlockSize(). + Status Decrypt(char* data) override { return Encrypt(data); } +}; +static const std::unordered_map<std::string, OptionTypeInfo> + ctr_encryption_provider_type_info = { + {"cipher", + OptionTypeInfo::AsCustomSharedPtr<BlockCipher>( + 0 /* No offset, whole struct*/, OptionVerificationType::kByName, + OptionTypeFlags::kNone)}, +}; +} // anonymous namespace + +// Allocate scratch space which is passed to EncryptBlock/DecryptBlock. +void CTRCipherStream::AllocateScratch(std::string& scratch) { + auto blockSize = cipher_->BlockSize(); + scratch.reserve(blockSize); +} + +// Encrypt a block of data at the given block index. +// Length of data is equal to BlockSize(); +Status CTRCipherStream::EncryptBlock(uint64_t blockIndex, char* data, + char* scratch) { + // Create nonce + counter + auto blockSize = cipher_->BlockSize(); + memmove(scratch, iv_.data(), blockSize); + EncodeFixed64(scratch, blockIndex + initialCounter_); + + // Encrypt nonce+counter + auto status = cipher_->Encrypt(scratch); + if (!status.ok()) { + return status; + } + + // XOR data with ciphertext. 
+ for (size_t i = 0; i < blockSize; i++) { + data[i] = data[i] ^ scratch[i]; + } + return Status::OK(); +} + +// Decrypt a block of data at the given block index. +// Length of data is equal to BlockSize(); +Status CTRCipherStream::DecryptBlock(uint64_t blockIndex, char* data, + char* scratch) { + // For CTR decryption & encryption are the same + return EncryptBlock(blockIndex, data, scratch); +} + +CTREncryptionProvider::CTREncryptionProvider( + const std::shared_ptr<BlockCipher>& c) + : cipher_(c) { + RegisterOptions("Cipher", &cipher_, &ctr_encryption_provider_type_info); +} + +bool CTREncryptionProvider::IsInstanceOf(const std::string& name) const { + // Special case for test purposes. + if (name == "1://test" && cipher_ != nullptr) { + return cipher_->IsInstanceOf(ROT13BlockCipher::kClassName()); + } else { + return EncryptionProvider::IsInstanceOf(name); + } +} + +// GetPrefixLength returns the length of the prefix that is added to every file +// and used for storing encryption options. +// For optimal performance, the prefix length should be a multiple of +// the page size. +size_t CTREncryptionProvider::GetPrefixLength() const { + return defaultPrefixLength; +} + +Status CTREncryptionProvider::AddCipher(const std::string& /*descriptor*/, + const char* cipher, size_t len, + bool /*for_write*/) { + if (cipher_) { + return Status::NotSupported("Cannot add keys to CTREncryptionProvider"); + } else if (strcmp(ROT13BlockCipher::kClassName(), cipher) == 0) { + cipher_.reset(new ROT13BlockCipher(len)); + return Status::OK(); + } else { + return BlockCipher::CreateFromString(ConfigOptions(), std::string(cipher), + &cipher_); + } +} + +// decodeCTRParameters decodes the initial counter & IV from the given +// (plain text) prefix. 
+static void decodeCTRParameters(const char* prefix, size_t blockSize, + uint64_t& initialCounter, Slice& iv) { + // First block contains 64-bit initial counter + initialCounter = DecodeFixed64(prefix); + // Second block contains IV + iv = Slice(prefix + blockSize, blockSize); +} + +// CreateNewPrefix initialized an allocated block of prefix memory +// for a new file. +Status CTREncryptionProvider::CreateNewPrefix(const std::string& /*fname*/, + char* prefix, + size_t prefixLength) const { + if (!cipher_) { + return Status::InvalidArgument("Encryption Cipher is missing"); + } + // Create & seed rnd. + Random rnd((uint32_t)SystemClock::Default()->NowMicros()); + // Fill entire prefix block with random values. + for (size_t i = 0; i < prefixLength; i++) { + prefix[i] = rnd.Uniform(256) & 0xFF; + } + // Take random data to extract initial counter & IV + auto blockSize = cipher_->BlockSize(); + uint64_t initialCounter; + Slice prefixIV; + decodeCTRParameters(prefix, blockSize, initialCounter, prefixIV); + + // Now populate the rest of the prefix, starting from the third block. + PopulateSecretPrefixPart(prefix + (2 * blockSize), + prefixLength - (2 * blockSize), blockSize); + + // Encrypt the prefix, starting from block 2 (leave block 0, 1 with initial + // counter & IV unencrypted) + CTRCipherStream cipherStream(cipher_, prefixIV.data(), initialCounter); + Status status; + { + PERF_TIMER_GUARD(encrypt_data_nanos); + status = cipherStream.Encrypt(0, prefix + (2 * blockSize), + prefixLength - (2 * blockSize)); + } + if (!status.ok()) { + return status; + } + return Status::OK(); +} + +// PopulateSecretPrefixPart initializes the data into a new prefix block +// in plain text. +// Returns the amount of space (starting from the start of the prefix) +// that has been initialized. 
+size_t CTREncryptionProvider::PopulateSecretPrefixPart( + char* /*prefix*/, size_t /*prefixLength*/, size_t /*blockSize*/) const { + // Nothing to do here, put in custom data in override when needed. + return 0; +} + +Status CTREncryptionProvider::CreateCipherStream( + const std::string& fname, const EnvOptions& options, Slice& prefix, + std::unique_ptr<BlockAccessCipherStream>* result) { + if (!cipher_) { + return Status::InvalidArgument("Encryption Cipher is missing"); + } + // Read plain text part of prefix. + auto blockSize = cipher_->BlockSize(); + uint64_t initialCounter; + Slice iv; + decodeCTRParameters(prefix.data(), blockSize, initialCounter, iv); + + // If the prefix is smaller than twice the block size, we would below read a + // very large chunk of the file (and very likely read over the bounds) + assert(prefix.size() >= 2 * blockSize); + if (prefix.size() < 2 * blockSize) { + return Status::Corruption("Unable to read from file " + fname + + ": read attempt would read beyond file bounds"); + } + + // Decrypt the encrypted part of the prefix, starting from block 2 (block 0, 1 + // with initial counter & IV are unencrypted) + CTRCipherStream cipherStream(cipher_, iv.data(), initialCounter); + Status status; + { + PERF_TIMER_GUARD(decrypt_data_nanos); + status = cipherStream.Decrypt(0, (char*)prefix.data() + (2 * blockSize), + prefix.size() - (2 * blockSize)); + } + if (!status.ok()) { + return status; + } + + // Create cipher stream + return CreateCipherStreamFromPrefix(fname, options, initialCounter, iv, + prefix, result); +} + +// CreateCipherStreamFromPrefix creates a block access cipher stream for a file +// given given name and options. The given prefix is already decrypted. 
+Status CTREncryptionProvider::CreateCipherStreamFromPrefix( + const std::string& /*fname*/, const EnvOptions& /*options*/, + uint64_t initialCounter, const Slice& iv, const Slice& /*prefix*/, + std::unique_ptr<BlockAccessCipherStream>* result) { + (*result) = std::unique_ptr<BlockAccessCipherStream>( + new CTRCipherStream(cipher_, iv.data(), initialCounter)); + return Status::OK(); +} + +namespace { +static void RegisterEncryptionBuiltins() { + static std::once_flag once; + std::call_once(once, [&]() { + auto lib = ObjectRegistry::Default()->AddLibrary("encryption"); + // Match "CTR" or "CTR://test" + lib->AddFactory<EncryptionProvider>( + ObjectLibrary::PatternEntry(CTREncryptionProvider::kClassName(), true) + .AddSuffix("://test"), + [](const std::string& uri, std::unique_ptr<EncryptionProvider>* guard, + std::string* /*errmsg*/) { + if (EndsWith(uri, "://test")) { + std::shared_ptr<BlockCipher> cipher = + std::make_shared<ROT13BlockCipher>(32); + guard->reset(new CTREncryptionProvider(cipher)); + } else { + guard->reset(new CTREncryptionProvider()); + } + return guard->get(); + }); + + lib->AddFactory<EncryptionProvider>( + "1://test", [](const std::string& /*uri*/, + std::unique_ptr<EncryptionProvider>* guard, + std::string* /*errmsg*/) { + std::shared_ptr<BlockCipher> cipher = + std::make_shared<ROT13BlockCipher>(32); + guard->reset(new CTREncryptionProvider(cipher)); + return guard->get(); + }); + + // Match "ROT13" or "ROT13:[0-9]+" + lib->AddFactory<BlockCipher>( + ObjectLibrary::PatternEntry(ROT13BlockCipher::kClassName(), true) + .AddNumber(":"), + [](const std::string& uri, std::unique_ptr<BlockCipher>* guard, + std::string* /* errmsg */) { + size_t colon = uri.find(':'); + if (colon != std::string::npos) { + size_t block_size = ParseSizeT(uri.substr(colon + 1)); + guard->reset(new ROT13BlockCipher(block_size)); + } else { + guard->reset(new ROT13BlockCipher(32)); + } + + return guard->get(); + }); + }); +} +} // namespace + +Status 
BlockCipher::CreateFromString(const ConfigOptions& config_options, + const std::string& value, + std::shared_ptr<BlockCipher>* result) { + RegisterEncryptionBuiltins(); + return LoadSharedObject<BlockCipher>(config_options, value, nullptr, result); +} + +Status EncryptionProvider::CreateFromString( + const ConfigOptions& config_options, const std::string& value, + std::shared_ptr<EncryptionProvider>* result) { + RegisterEncryptionBuiltins(); + return LoadSharedObject<EncryptionProvider>(config_options, value, nullptr, + result); +} + +#endif // ROCKSDB_LITE + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/env/env_encryption_ctr.h b/src/rocksdb/env/env_encryption_ctr.h new file mode 100644 index 000000000..cfb440c72 --- /dev/null +++ b/src/rocksdb/env/env_encryption_ctr.h @@ -0,0 +1,116 @@ +// Copyright (c) 2016-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#if !defined(ROCKSDB_LITE) + +#include "rocksdb/env_encryption.h" + +namespace ROCKSDB_NAMESPACE { +// CTRCipherStream implements BlockAccessCipherStream using an +// Counter operations mode. +// See https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation +// +// Note: This is a possible implementation of BlockAccessCipherStream, +// it is considered suitable for use. +class CTRCipherStream final : public BlockAccessCipherStream { + private: + std::shared_ptr<BlockCipher> cipher_; + std::string iv_; + uint64_t initialCounter_; + + public: + CTRCipherStream(const std::shared_ptr<BlockCipher>& c, const char* iv, + uint64_t initialCounter) + : cipher_(c), iv_(iv, c->BlockSize()), initialCounter_(initialCounter){}; + virtual ~CTRCipherStream(){}; + + // BlockSize returns the size of each block supported by this cipher stream. 
+ size_t BlockSize() override { return cipher_->BlockSize(); } + + protected: + // Allocate scratch space which is passed to EncryptBlock/DecryptBlock. + void AllocateScratch(std::string&) override; + + // Encrypt a block of data at the given block index. + // Length of data is equal to BlockSize(); + Status EncryptBlock(uint64_t blockIndex, char* data, char* scratch) override; + + // Decrypt a block of data at the given block index. + // Length of data is equal to BlockSize(); + Status DecryptBlock(uint64_t blockIndex, char* data, char* scratch) override; +}; + +// This encryption provider uses a CTR cipher stream, with a given block cipher +// and IV. +// +// Note: This is a possible implementation of EncryptionProvider, +// it is considered suitable for use, provided a safe BlockCipher is used. +class CTREncryptionProvider : public EncryptionProvider { + private: + std::shared_ptr<BlockCipher> cipher_; + + protected: + // For optimal performance when using direct IO, the prefix length should be a + // multiple of the page size. This size is to ensure the first real data byte + // is placed at largest known alignment point for direct io. + const static size_t defaultPrefixLength = 4096; + + public: + explicit CTREncryptionProvider( + const std::shared_ptr<BlockCipher>& c = nullptr); + virtual ~CTREncryptionProvider() {} + + static const char* kClassName() { return "CTR"; } + const char* Name() const override { return kClassName(); } + bool IsInstanceOf(const std::string& name) const override; + // GetPrefixLength returns the length of the prefix that is added to every + // file + // and used for storing encryption options. + // For optimal performance when using direct IO, the prefix length should be a + // multiple of the page size. + size_t GetPrefixLength() const override; + + // CreateNewPrefix initialized an allocated block of prefix memory + // for a new file. 
+ Status CreateNewPrefix(const std::string& fname, char* prefix, + size_t prefixLength) const override; + + // CreateCipherStream creates a block access cipher stream for a file given + // given name and options. + Status CreateCipherStream( + const std::string& fname, const EnvOptions& options, Slice& prefix, + std::unique_ptr<BlockAccessCipherStream>* result) override; + + Status AddCipher(const std::string& descriptor, const char* /*cipher*/, + size_t /*len*/, bool /*for_write*/) override; + + protected: + // PopulateSecretPrefixPart initializes the data into a new prefix block + // that will be encrypted. This function will store the data in plain text. + // It will be encrypted later (before written to disk). + // Returns the amount of space (starting from the start of the prefix) + // that has been initialized. + virtual size_t PopulateSecretPrefixPart(char* prefix, size_t prefixLength, + size_t blockSize) const; + + // CreateCipherStreamFromPrefix creates a block access cipher stream for a + // file given + // given name and options. The given prefix is already decrypted. + virtual Status CreateCipherStreamFromPrefix( + const std::string& fname, const EnvOptions& options, + uint64_t initialCounter, const Slice& iv, const Slice& prefix, + std::unique_ptr<BlockAccessCipherStream>* result); +}; + +Status NewEncryptedFileSystemImpl( + const std::shared_ptr<FileSystem>& base_fs, + const std::shared_ptr<EncryptionProvider>& provider, + std::unique_ptr<FileSystem>* fs); + +} // namespace ROCKSDB_NAMESPACE + +#endif // !defined(ROCKSDB_LITE) diff --git a/src/rocksdb/env/env_posix.cc b/src/rocksdb/env/env_posix.cc new file mode 100644 index 000000000..77f28e1f5 --- /dev/null +++ b/src/rocksdb/env/env_posix.cc @@ -0,0 +1,520 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors + +#include "port/lang.h" +#if !defined(OS_WIN) + +#include <dirent.h> +#ifndef ROCKSDB_NO_DYNAMIC_EXTENSION +#include <dlfcn.h> +#endif +#include <errno.h> +#include <fcntl.h> + +#if defined(ROCKSDB_IOURING_PRESENT) +#include <liburing.h> +#endif +#include <pthread.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/mman.h> +#include <sys/stat.h> +#if defined(OS_LINUX) || defined(OS_SOLARIS) || defined(OS_ANDROID) +#include <sys/statfs.h> +#endif +#include <sys/statvfs.h> +#include <sys/time.h> +#include <sys/types.h> +#if defined(ROCKSDB_IOURING_PRESENT) +#include <sys/uio.h> +#endif +#include <time.h> +#include <unistd.h> + +#include <algorithm> +// Get nano time includes +#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD) +#elif defined(__MACH__) +#include <Availability.h> +#include <mach/clock.h> +#include <mach/mach.h> +#else +#include <chrono> +#endif +#include <deque> +#include <set> +#include <vector> + +#include "env/composite_env_wrapper.h" +#include "env/io_posix.h" +#include "monitoring/iostats_context_imp.h" +#include "monitoring/thread_status_updater.h" +#include "port/port.h" +#include "port/sys_time.h" +#include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/system_clock.h" +#include "test_util/sync_point.h" +#include "util/coding.h" +#include "util/compression_context_cache.h" +#include "util/random.h" +#include "util/string_util.h" +#include "util/thread_local.h" +#include "util/threadpool_imp.h" + +#if !defined(TMPFS_MAGIC) 
+#define TMPFS_MAGIC 0x01021994 +#endif +#if !defined(XFS_SUPER_MAGIC) +#define XFS_SUPER_MAGIC 0x58465342 +#endif +#if !defined(EXT4_SUPER_MAGIC) +#define EXT4_SUPER_MAGIC 0xEF53 +#endif + +namespace ROCKSDB_NAMESPACE { +#if defined(OS_WIN) +static const std::string kSharedLibExt = ".dll"; +static const char kPathSeparator = ';'; +#else +static const char kPathSeparator = ':'; +#if defined(OS_MACOSX) +static const std::string kSharedLibExt = ".dylib"; +#else +static const std::string kSharedLibExt = ".so"; +#endif +#endif + +namespace { + +ThreadStatusUpdater* CreateThreadStatusUpdater() { + return new ThreadStatusUpdater(); +} + +#ifndef ROCKSDB_NO_DYNAMIC_EXTENSION +class PosixDynamicLibrary : public DynamicLibrary { + public: + PosixDynamicLibrary(const std::string& name, void* handle) + : name_(name), handle_(handle) {} + ~PosixDynamicLibrary() override { dlclose(handle_); } + + Status LoadSymbol(const std::string& sym_name, void** func) override { + assert(nullptr != func); + dlerror(); // Clear any old error + *func = dlsym(handle_, sym_name.c_str()); + if (*func != nullptr) { + return Status::OK(); + } else { + char* err = dlerror(); + return Status::NotFound("Error finding symbol: " + sym_name, err); + } + } + + const char* Name() const override { return name_.c_str(); } + + private: + std::string name_; + void* handle_; +}; +#endif // !ROCKSDB_NO_DYNAMIC_EXTENSION + +class PosixClock : public SystemClock { + public: + static const char* kClassName() { return "PosixClock"; } + const char* Name() const override { return kDefaultName(); } + const char* NickName() const override { return kClassName(); } + + uint64_t NowMicros() override { + port::TimeVal tv; + port::GetTimeOfDay(&tv, nullptr); + return static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec; + } + + uint64_t NowNanos() override { +#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD) || \ + defined(OS_AIX) + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + 
return static_cast<uint64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec; +#elif defined(OS_SOLARIS) + return gethrtime(); +#elif defined(__MACH__) + clock_serv_t cclock; + mach_timespec_t ts; + host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock); + clock_get_time(cclock, &ts); + mach_port_deallocate(mach_task_self(), cclock); + return static_cast<uint64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec; +#else + return std::chrono::duration_cast<std::chrono::nanoseconds>( + std::chrono::steady_clock::now().time_since_epoch()) + .count(); +#endif + } + + uint64_t CPUMicros() override { +#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD) || \ + defined(OS_AIX) || (defined(__MACH__) && defined(__MAC_10_12)) + struct timespec ts; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts); + return (static_cast<uint64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec) / 1000; +#endif + return 0; + } + + uint64_t CPUNanos() override { +#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD) || \ + defined(OS_AIX) || (defined(__MACH__) && defined(__MAC_10_12)) + struct timespec ts; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts); + return static_cast<uint64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec; +#endif + return 0; + } + + void SleepForMicroseconds(int micros) override { usleep(micros); } + + Status GetCurrentTime(int64_t* unix_time) override { + time_t ret = time(nullptr); + if (ret == (time_t)-1) { + return IOError("GetCurrentTime", "", errno); + } + *unix_time = (int64_t)ret; + return Status::OK(); + } + + std::string TimeToString(uint64_t secondsSince1970) override { + const time_t seconds = (time_t)secondsSince1970; + struct tm t; + int maxsize = 64; + std::string dummy; + dummy.reserve(maxsize); + dummy.resize(maxsize); + char* p = &dummy[0]; + port::LocalTimeR(&seconds, &t); + snprintf(p, maxsize, "%04d/%02d/%02d-%02d:%02d:%02d ", t.tm_year + 1900, + t.tm_mon + 1, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec); + return dummy; + } +}; + +class PosixEnv : 
public CompositeEnv { + public: + static const char* kClassName() { return "PosixEnv"; } + const char* Name() const override { return kClassName(); } + const char* NickName() const override { return kDefaultName(); } + + ~PosixEnv() override { + if (this == Env::Default()) { + for (const auto tid : threads_to_join_) { + pthread_join(tid, nullptr); + } + for (int pool_id = 0; pool_id < Env::Priority::TOTAL; ++pool_id) { + thread_pools_[pool_id].JoinAllThreads(); + } + // Do not delete the thread_status_updater_ in order to avoid the + // free after use when Env::Default() is destructed while some other + // child threads are still trying to update thread status. All + // PosixEnv instances use the same thread_status_updater_, so never + // explicitly delete it. + } + } + + void SetFD_CLOEXEC(int fd, const EnvOptions* options) { + if ((options == nullptr || options->set_fd_cloexec) && fd > 0) { + fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC); + } + } + +#ifndef ROCKSDB_NO_DYNAMIC_EXTENSION + // Loads the named library into the result. + // If the input name is empty, the current executable is loaded + // On *nix systems, a "lib" prefix is added to the name if one is not supplied + // Comparably, the appropriate shared library extension is added to the name + // if not supplied. 
If search_path is not specified, the shared library will + // be loaded using the default path (LD_LIBRARY_PATH) If search_path is + // specified, the shared library will be searched for in the directories + // provided by the search path + Status LoadLibrary(const std::string& name, const std::string& path, + std::shared_ptr<DynamicLibrary>* result) override { + assert(result != nullptr); + if (name.empty()) { + void* hndl = dlopen(NULL, RTLD_NOW); + if (hndl != nullptr) { + result->reset(new PosixDynamicLibrary(name, hndl)); + return Status::OK(); + } + } else { + std::string library_name = name; + if (library_name.find(kSharedLibExt) == std::string::npos) { + library_name = library_name + kSharedLibExt; + } +#if !defined(OS_WIN) + if (library_name.find('/') == std::string::npos && + library_name.compare(0, 3, "lib") != 0) { + library_name = "lib" + library_name; + } +#endif + if (path.empty()) { + void* hndl = dlopen(library_name.c_str(), RTLD_NOW); + if (hndl != nullptr) { + result->reset(new PosixDynamicLibrary(library_name, hndl)); + return Status::OK(); + } + } else { + std::string local_path; + std::stringstream ss(path); + while (getline(ss, local_path, kPathSeparator)) { + if (!path.empty()) { + std::string full_name = local_path + "/" + library_name; + void* hndl = dlopen(full_name.c_str(), RTLD_NOW); + if (hndl != nullptr) { + result->reset(new PosixDynamicLibrary(full_name, hndl)); + return Status::OK(); + } + } + } + } + } + return Status::IOError( + IOErrorMsg("Failed to open shared library: xs", name), dlerror()); + } +#endif // !ROCKSDB_NO_DYNAMIC_EXTENSION + + void Schedule(void (*function)(void* arg1), void* arg, Priority pri = LOW, + void* tag = nullptr, + void (*unschedFunction)(void* arg) = nullptr) override; + + int UnSchedule(void* arg, Priority pri) override; + + void StartThread(void (*function)(void* arg), void* arg) override; + + void WaitForJoin() override; + + unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const override; + + 
int ReserveThreads(int threads_to_be_reserved, Priority pri) override; + + int ReleaseThreads(int threads_to_be_released, Priority pri) override; + + Status GetThreadList(std::vector<ThreadStatus>* thread_list) override { + assert(thread_status_updater_); + return thread_status_updater_->GetThreadList(thread_list); + } + + uint64_t GetThreadID() const override { + uint64_t thread_id = 0; +#if defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ) +#if __GLIBC_PREREQ(2, 30) + thread_id = ::gettid(); +#else // __GLIBC_PREREQ(2, 30) + pthread_t tid = pthread_self(); + memcpy(&thread_id, &tid, std::min(sizeof(thread_id), sizeof(tid))); +#endif // __GLIBC_PREREQ(2, 30) +#else // defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ) + pthread_t tid = pthread_self(); + memcpy(&thread_id, &tid, std::min(sizeof(thread_id), sizeof(tid))); +#endif // defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ) + return thread_id; + } + + Status GetHostName(char* name, uint64_t len) override { + int ret = gethostname(name, static_cast<size_t>(len)); + if (ret < 0) { + if (errno == EFAULT || errno == EINVAL) { + return Status::InvalidArgument(errnoStr(errno).c_str()); + } else { + return IOError("GetHostName", name, errno); + } + } + return Status::OK(); + } + + ThreadStatusUpdater* GetThreadStatusUpdater() const override { + return Env::GetThreadStatusUpdater(); + } + + std::string GenerateUniqueId() override { return Env::GenerateUniqueId(); } + + // Allow increasing the number of worker threads. 
  // Forwards the requested thread count to the pool selected by `pri`.
  void SetBackgroundThreads(int num, Priority pri) override {
    assert(pri >= Priority::BOTTOM && pri <= Priority::HIGH);
    thread_pools_[pri].SetBackgroundThreads(num);
  }

  // Returns the thread count reported by the pool selected by `pri`.
  int GetBackgroundThreads(Priority pri) override {
    assert(pri >= Priority::BOTTOM && pri <= Priority::HIGH);
    return thread_pools_[pri].GetBackgroundThreads();
  }

  // Records the flag; always returns OK.
  Status SetAllowNonOwnerAccess(bool allow_non_owner_access) override {
    allow_non_owner_access_ = allow_non_owner_access;
    return Status::OK();
  }

  // Allow increasing the number of worker threads.
  void IncBackgroundThreadsIfNeeded(int num, Priority pri) override {
    assert(pri >= Priority::BOTTOM && pri <= Priority::HIGH);
    thread_pools_[pri].IncBackgroundThreadsIfNeeded(num);
  }

  // Lowers the I/O priority of the selected pool; compiled to a no-op
  // unless OS_LINUX is defined.
  void LowerThreadPoolIOPriority(Priority pool) override {
    assert(pool >= Priority::BOTTOM && pool <= Priority::HIGH);
#ifdef OS_LINUX
    thread_pools_[pool].LowerIOPriority();
#else
    (void)pool;
#endif
  }

  // Single-argument overload: lowers the selected pool to CpuPriority::kLow.
  void LowerThreadPoolCPUPriority(Priority pool) override {
    assert(pool >= Priority::BOTTOM && pool <= Priority::HIGH);
    thread_pools_[pool].LowerCPUPriority(CpuPriority::kLow);
  }

  // Two-argument overload: forwards the requested CPU priority to the
  // selected pool and always returns OK.
  Status LowerThreadPoolCPUPriority(Priority pool, CpuPriority pri) override {
    assert(pool >= Priority::BOTTOM && pool <= Priority::HIGH);
    thread_pools_[pool].LowerCPUPriority(pri);
    return Status::OK();
  }

 private:
  friend Env* Env::Default();
  // Constructs the default Env, a singleton
  PosixEnv();

  // The below 4 members are only used by the default PosixEnv instance.
  // Non-default instances simply maintain references to the backing
  // members in the default instance
  std::vector<ThreadPoolImpl> thread_pools_storage_;
  pthread_mutex_t mu_storage_;
  std::vector<pthread_t> threads_to_join_storage_;
  bool allow_non_owner_access_storage_;

  // References through which the methods of this class reach the state
  // above.
  std::vector<ThreadPoolImpl>& thread_pools_;
  pthread_mutex_t& mu_;
  std::vector<pthread_t>& threads_to_join_;
  // If true, allow non owner read access for db files.
Otherwise, non-owner + // has no access to db files. + bool& allow_non_owner_access_; +}; + +PosixEnv::PosixEnv() + : CompositeEnv(FileSystem::Default(), SystemClock::Default()), + thread_pools_storage_(Priority::TOTAL), + allow_non_owner_access_storage_(true), + thread_pools_(thread_pools_storage_), + mu_(mu_storage_), + threads_to_join_(threads_to_join_storage_), + allow_non_owner_access_(allow_non_owner_access_storage_) { + ThreadPoolImpl::PthreadCall("mutex_init", pthread_mutex_init(&mu_, nullptr)); + for (int pool_id = 0; pool_id < Env::Priority::TOTAL; ++pool_id) { + thread_pools_[pool_id].SetThreadPriority( + static_cast<Env::Priority>(pool_id)); + // This allows later initializing the thread-local-env of each thread. + thread_pools_[pool_id].SetHostEnv(this); + } + thread_status_updater_ = CreateThreadStatusUpdater(); +} + +void PosixEnv::Schedule(void (*function)(void* arg1), void* arg, Priority pri, + void* tag, void (*unschedFunction)(void* arg)) { + assert(pri >= Priority::BOTTOM && pri <= Priority::HIGH); + thread_pools_[pri].Schedule(function, arg, tag, unschedFunction); +} + +int PosixEnv::UnSchedule(void* arg, Priority pri) { + return thread_pools_[pri].UnSchedule(arg); +} + +unsigned int PosixEnv::GetThreadPoolQueueLen(Priority pri) const { + assert(pri >= Priority::BOTTOM && pri <= Priority::HIGH); + return thread_pools_[pri].GetQueueLen(); +} + +int PosixEnv::ReserveThreads(int threads_to_reserved, Priority pri) { + assert(pri >= Priority::BOTTOM && pri <= Priority::HIGH); + return thread_pools_[pri].ReserveThreads(threads_to_reserved); +} + +int PosixEnv::ReleaseThreads(int threads_to_released, Priority pri) { + assert(pri >= Priority::BOTTOM && pri <= Priority::HIGH); + return thread_pools_[pri].ReleaseThreads(threads_to_released); +} + +struct StartThreadState { + void (*user_function)(void*); + void* arg; +}; + +static void* StartThreadWrapper(void* arg) { + StartThreadState* state = reinterpret_cast<StartThreadState*>(arg); + 
state->user_function(state->arg); + delete state; + return nullptr; +} + +void PosixEnv::StartThread(void (*function)(void* arg), void* arg) { + pthread_t t; + StartThreadState* state = new StartThreadState; + state->user_function = function; + state->arg = arg; + ThreadPoolImpl::PthreadCall( + "start thread", pthread_create(&t, nullptr, &StartThreadWrapper, state)); + ThreadPoolImpl::PthreadCall("lock", pthread_mutex_lock(&mu_)); + threads_to_join_.push_back(t); + ThreadPoolImpl::PthreadCall("unlock", pthread_mutex_unlock(&mu_)); +} + +void PosixEnv::WaitForJoin() { + for (const auto tid : threads_to_join_) { + pthread_join(tid, nullptr); + } + threads_to_join_.clear(); +} + +} // namespace + +// +// Default Posix Env +// +Env* Env::Default() { + // The following function call initializes the singletons of ThreadLocalPtr + // right before the static default_env. This guarantees default_env will + // always being destructed before the ThreadLocalPtr singletons get + // destructed as C++ guarantees that the destructions of static variables + // is in the reverse order of their constructions. + // + // Since static members are destructed in the reverse order + // of their construction, having this call here guarantees that + // the destructor of static PosixEnv will go first, then the + // the singletons of ThreadLocalPtr. + ThreadLocalPtr::InitSingletons(); + CompressionContextCache::InitSingleton(); + INIT_SYNC_POINT_SINGLETONS(); + // ~PosixEnv must be called on exit + //**TODO: Can we make this a STATIC_AVOID_DESTRUCTION? 
+ static PosixEnv default_env; + return &default_env; +} + +// +// Default Posix SystemClock +// +const std::shared_ptr<SystemClock>& SystemClock::Default() { + STATIC_AVOID_DESTRUCTION(std::shared_ptr<SystemClock>, instance) + (std::make_shared<PosixClock>()); + return instance; +} +} // namespace ROCKSDB_NAMESPACE + +#endif diff --git a/src/rocksdb/env/env_test.cc b/src/rocksdb/env/env_test.cc new file mode 100644 index 000000000..f4e9d50b2 --- /dev/null +++ b/src/rocksdb/env/env_test.cc @@ -0,0 +1,3562 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef OS_WIN +#include <sys/ioctl.h> +#endif + +#if defined(ROCKSDB_IOURING_PRESENT) +#include <liburing.h> +#include <sys/uio.h> +#endif + +#include <sys/types.h> + +#include <atomic> +#include <list> +#include <mutex> +#include <unordered_set> + +#ifdef OS_LINUX +#include <fcntl.h> +#include <linux/fs.h> +#include <stdlib.h> +#include <sys/stat.h> +#include <unistd.h> +#endif + +#ifdef ROCKSDB_FALLOCATE_PRESENT +#include <errno.h> +#endif + +#include "db/db_impl/db_impl.h" +#include "env/emulated_clock.h" +#include "env/env_chroot.h" +#include "env/env_encryption_ctr.h" +#include "env/fs_readonly.h" +#include "env/mock_env.h" +#include "env/unique_id_gen.h" +#include "logging/log_buffer.h" +#include "logging/logging.h" +#include "options/options_helper.h" +#include "port/malloc.h" +#include "port/port.h" +#include "port/stack_trace.h" +#include "rocksdb/convenience.h" +#include "rocksdb/env.h" +#include "rocksdb/env_encryption.h" +#include "rocksdb/file_system.h" +#include 
"rocksdb/system_clock.h" +#include "rocksdb/utilities/object_registry.h" +#include "test_util/mock_time_env.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/coding.h" +#include "util/crc32c.h" +#include "util/mutexlock.h" +#include "util/random.h" +#include "util/string_util.h" +#include "utilities/counted_fs.h" +#include "utilities/env_timed.h" +#include "utilities/fault_injection_env.h" +#include "utilities/fault_injection_fs.h" + +namespace ROCKSDB_NAMESPACE { + +using port::kPageSize; + +static const int kDelayMicros = 100000; + +struct Deleter { + explicit Deleter(void (*fn)(void*)) : fn_(fn) {} + + void operator()(void* ptr) { + assert(fn_); + assert(ptr); + (*fn_)(ptr); + } + + void (*fn_)(void*); +}; + +extern "C" bool RocksDbIOUringEnable() { return true; } + +std::unique_ptr<char, Deleter> NewAligned(const size_t size, const char ch) { + char* ptr = nullptr; +#ifdef OS_WIN + if (nullptr == + (ptr = reinterpret_cast<char*>(_aligned_malloc(size, kPageSize)))) { + return std::unique_ptr<char, Deleter>(nullptr, Deleter(_aligned_free)); + } + std::unique_ptr<char, Deleter> uptr(ptr, Deleter(_aligned_free)); +#else + if (posix_memalign(reinterpret_cast<void**>(&ptr), kPageSize, size) != 0) { + return std::unique_ptr<char, Deleter>(nullptr, Deleter(free)); + } + std::unique_ptr<char, Deleter> uptr(ptr, Deleter(free)); +#endif + memset(uptr.get(), ch, size); + return uptr; +} + +class EnvPosixTest : public testing::Test { + private: + port::Mutex mu_; + std::string events_; + + public: + Env* env_; + bool direct_io_; + EnvPosixTest() : env_(Env::Default()), direct_io_(false) {} + ~EnvPosixTest() { + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->LoadDependency({}); + SyncPoint::GetInstance()->ClearAllCallBacks(); + } +}; + +class EnvPosixTestWithParam + : public EnvPosixTest, + public ::testing::WithParamInterface<std::pair<Env*, bool>> { + public: + 
EnvPosixTestWithParam() { + std::pair<Env*, bool> param_pair = GetParam(); + env_ = param_pair.first; + direct_io_ = param_pair.second; + } + + void WaitThreadPoolsEmpty() { + // Wait until the thread pools are empty. + while (env_->GetThreadPoolQueueLen(Env::Priority::LOW) != 0) { + Env::Default()->SleepForMicroseconds(kDelayMicros); + } + while (env_->GetThreadPoolQueueLen(Env::Priority::HIGH) != 0) { + Env::Default()->SleepForMicroseconds(kDelayMicros); + } + } + + ~EnvPosixTestWithParam() override { WaitThreadPoolsEmpty(); } +}; + +static void SetBool(void* ptr) { + reinterpret_cast<std::atomic<bool>*>(ptr)->store(true); +} + +TEST_F(EnvPosixTest, DISABLED_RunImmediately) { + for (int pri = Env::BOTTOM; pri < Env::TOTAL; ++pri) { + std::atomic<bool> called(false); + env_->SetBackgroundThreads(1, static_cast<Env::Priority>(pri)); + env_->Schedule(&SetBool, &called, static_cast<Env::Priority>(pri)); + Env::Default()->SleepForMicroseconds(kDelayMicros); + ASSERT_TRUE(called.load()); + } +} + +TEST_F(EnvPosixTest, RunEventually) { + std::atomic<bool> called(false); + env_->StartThread(&SetBool, &called); + env_->WaitForJoin(); + ASSERT_TRUE(called.load()); +} + +#ifdef OS_WIN +TEST_F(EnvPosixTest, AreFilesSame) { + { + bool tmp; + if (env_->AreFilesSame("", "", &tmp).IsNotSupported()) { + fprintf(stderr, + "skipping EnvBasicTestWithParam.AreFilesSame due to " + "unsupported Env::AreFilesSame\n"); + return; + } + } + + const EnvOptions soptions; + auto* env = Env::Default(); + std::string same_file_name = test::PerThreadDBPath(env, "same_file"); + std::string same_file_link_name = same_file_name + "_link"; + + std::unique_ptr<WritableFile> same_file; + ASSERT_OK(env->NewWritableFile(same_file_name, &same_file, soptions)); + same_file->Append("random_data"); + ASSERT_OK(same_file->Flush()); + same_file.reset(); + + ASSERT_OK(env->LinkFile(same_file_name, same_file_link_name)); + bool result = false; + ASSERT_OK(env->AreFilesSame(same_file_name, same_file_link_name, 
&result)); + ASSERT_TRUE(result); +} +#endif + +#ifdef OS_LINUX +TEST_F(EnvPosixTest, DISABLED_FilePermission) { + // Only works for Linux environment + if (env_ == Env::Default()) { + EnvOptions soptions; + std::vector<std::string> fileNames{ + test::PerThreadDBPath(env_, "testfile"), + test::PerThreadDBPath(env_, "testfile1")}; + std::unique_ptr<WritableFile> wfile; + ASSERT_OK(env_->NewWritableFile(fileNames[0], &wfile, soptions)); + ASSERT_OK(env_->NewWritableFile(fileNames[1], &wfile, soptions)); + wfile.reset(); + std::unique_ptr<RandomRWFile> rwfile; + ASSERT_OK(env_->NewRandomRWFile(fileNames[1], &rwfile, soptions)); + + struct stat sb; + for (const auto& filename : fileNames) { + if (::stat(filename.c_str(), &sb) == 0) { + ASSERT_EQ(sb.st_mode & 0777, 0644); + } + ASSERT_OK(env_->DeleteFile(filename)); + } + + env_->SetAllowNonOwnerAccess(false); + ASSERT_OK(env_->NewWritableFile(fileNames[0], &wfile, soptions)); + ASSERT_OK(env_->NewWritableFile(fileNames[1], &wfile, soptions)); + wfile.reset(); + ASSERT_OK(env_->NewRandomRWFile(fileNames[1], &rwfile, soptions)); + + for (const auto& filename : fileNames) { + if (::stat(filename.c_str(), &sb) == 0) { + ASSERT_EQ(sb.st_mode & 0777, 0600); + } + ASSERT_OK(env_->DeleteFile(filename)); + } + } +} + +TEST_F(EnvPosixTest, LowerThreadPoolCpuPriority) { + std::atomic<CpuPriority> from_priority(CpuPriority::kNormal); + std::atomic<CpuPriority> to_priority(CpuPriority::kNormal); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "ThreadPoolImpl::BGThread::BeforeSetCpuPriority", [&](void* pri) { + from_priority.store(*reinterpret_cast<CpuPriority*>(pri)); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "ThreadPoolImpl::BGThread::AfterSetCpuPriority", [&](void* pri) { + to_priority.store(*reinterpret_cast<CpuPriority*>(pri)); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + env_->SetBackgroundThreads(1, Env::BOTTOM); + env_->SetBackgroundThreads(1, 
Env::HIGH); + + auto RunTask = [&](Env::Priority pool) { + std::atomic<bool> called(false); + env_->Schedule(&SetBool, &called, pool); + for (int i = 0; i < kDelayMicros; i++) { + if (called.load()) { + break; + } + Env::Default()->SleepForMicroseconds(1); + } + ASSERT_TRUE(called.load()); + }; + + { + // Same priority, no-op. + env_->LowerThreadPoolCPUPriority(Env::Priority::BOTTOM, + CpuPriority::kNormal) + .PermitUncheckedError(); + RunTask(Env::Priority::BOTTOM); + ASSERT_EQ(from_priority, CpuPriority::kNormal); + ASSERT_EQ(to_priority, CpuPriority::kNormal); + } + + { + // Higher priority, no-op. + env_->LowerThreadPoolCPUPriority(Env::Priority::BOTTOM, CpuPriority::kHigh) + .PermitUncheckedError(); + RunTask(Env::Priority::BOTTOM); + ASSERT_EQ(from_priority, CpuPriority::kNormal); + ASSERT_EQ(to_priority, CpuPriority::kNormal); + } + + { + // Lower priority from kNormal -> kLow. + env_->LowerThreadPoolCPUPriority(Env::Priority::BOTTOM, CpuPriority::kLow) + .PermitUncheckedError(); + RunTask(Env::Priority::BOTTOM); + ASSERT_EQ(from_priority, CpuPriority::kNormal); + ASSERT_EQ(to_priority, CpuPriority::kLow); + } + + { + // Lower priority from kLow -> kIdle. + env_->LowerThreadPoolCPUPriority(Env::Priority::BOTTOM, CpuPriority::kIdle) + .PermitUncheckedError(); + RunTask(Env::Priority::BOTTOM); + ASSERT_EQ(from_priority, CpuPriority::kLow); + ASSERT_EQ(to_priority, CpuPriority::kIdle); + } + + { + // Lower priority from kNormal -> kIdle for another pool. 
+ env_->LowerThreadPoolCPUPriority(Env::Priority::HIGH, CpuPriority::kIdle) + .PermitUncheckedError(); + RunTask(Env::Priority::HIGH); + ASSERT_EQ(from_priority, CpuPriority::kNormal); + ASSERT_EQ(to_priority, CpuPriority::kIdle); + } + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} +#endif + +TEST_F(EnvPosixTest, MemoryMappedFileBuffer) { + const int kFileBytes = 1 << 15; // 32 KB + std::string expected_data; + std::string fname = test::PerThreadDBPath(env_, "testfile"); + { + std::unique_ptr<WritableFile> wfile; + const EnvOptions soptions; + ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions)); + + Random rnd(301); + expected_data = rnd.RandomString(kFileBytes); + ASSERT_OK(wfile->Append(expected_data)); + } + + std::unique_ptr<MemoryMappedFileBuffer> mmap_buffer; + Status status = env_->NewMemoryMappedFileBuffer(fname, &mmap_buffer); + // it should be supported at least on linux +#if !defined(OS_LINUX) + if (status.IsNotSupported()) { + fprintf(stderr, + "skipping EnvPosixTest.MemoryMappedFileBuffer due to " + "unsupported Env::NewMemoryMappedFileBuffer\n"); + return; + } +#endif // !defined(OS_LINUX) + + ASSERT_OK(status); + ASSERT_NE(nullptr, mmap_buffer.get()); + ASSERT_NE(nullptr, mmap_buffer->GetBase()); + ASSERT_EQ(kFileBytes, mmap_buffer->GetLen()); + std::string actual_data(reinterpret_cast<const char*>(mmap_buffer->GetBase()), + mmap_buffer->GetLen()); + ASSERT_EQ(expected_data, actual_data); +} + +#ifndef ROCKSDB_NO_DYNAMIC_EXTENSION +TEST_F(EnvPosixTest, LoadRocksDBLibrary) { + std::shared_ptr<DynamicLibrary> library; + std::function<void*(void*, const char*)> function; + Status status = env_->LoadLibrary("no-such-library", "", &library); + ASSERT_NOK(status); + ASSERT_EQ(nullptr, library.get()); + status = env_->LoadLibrary("rocksdb", "", &library); + if (status.ok()) { // If we have can find a rocksdb shared library + ASSERT_NE(nullptr, library.get()); 
+ ASSERT_OK(library->LoadFunction("rocksdb_create_default_env", + &function)); // from C definition + ASSERT_NE(nullptr, function); + ASSERT_NOK(library->LoadFunction("no-such-method", &function)); + ASSERT_EQ(nullptr, function); + ASSERT_OK(env_->LoadLibrary(library->Name(), "", &library)); + } else { + ASSERT_EQ(nullptr, library.get()); + } +} +#endif // !ROCKSDB_NO_DYNAMIC_EXTENSION + +#if !defined(OS_WIN) && !defined(ROCKSDB_NO_DYNAMIC_EXTENSION) +TEST_F(EnvPosixTest, LoadRocksDBLibraryWithSearchPath) { + std::shared_ptr<DynamicLibrary> library; + std::function<void*(void*, const char*)> function; + ASSERT_NOK(env_->LoadLibrary("no-such-library", "/tmp", &library)); + ASSERT_EQ(nullptr, library.get()); + ASSERT_NOK(env_->LoadLibrary("dl", "/tmp", &library)); + ASSERT_EQ(nullptr, library.get()); + Status status = env_->LoadLibrary("rocksdb", "/tmp:./", &library); + if (status.ok()) { + ASSERT_NE(nullptr, library.get()); + ASSERT_OK(env_->LoadLibrary(library->Name(), "", &library)); + } + char buff[1024]; + std::string cwd = getcwd(buff, sizeof(buff)); + + status = env_->LoadLibrary("rocksdb", "/tmp:" + cwd, &library); + if (status.ok()) { + ASSERT_NE(nullptr, library.get()); + ASSERT_OK(env_->LoadLibrary(library->Name(), "", &library)); + } +} +#endif // !OS_WIN && !ROCKSDB_NO_DYNAMIC_EXTENSION + +TEST_P(EnvPosixTestWithParam, UnSchedule) { + std::atomic<bool> called(false); + env_->SetBackgroundThreads(1, Env::LOW); + + /* Block the low priority queue */ + test::SleepingBackgroundTask sleeping_task, sleeping_task1; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task, + Env::Priority::LOW); + + /* Schedule another task */ + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task1, + Env::Priority::LOW, &sleeping_task1); + + /* Remove it with a different tag */ + ASSERT_EQ(0, env_->UnSchedule(&called, Env::Priority::LOW)); + + /* Remove it from the queue with the right tag */ + ASSERT_EQ(1, 
env_->UnSchedule(&sleeping_task1, Env::Priority::LOW)); + + // Unblock background thread + sleeping_task.WakeUp(); + + /* Schedule another task */ + env_->Schedule(&SetBool, &called); + for (int i = 0; i < kDelayMicros; i++) { + if (called.load()) { + break; + } + Env::Default()->SleepForMicroseconds(1); + } + ASSERT_TRUE(called.load()); + + ASSERT_TRUE(!sleeping_task.IsSleeping() && !sleeping_task1.IsSleeping()); + WaitThreadPoolsEmpty(); +} + +// This tests assumes that the last scheduled +// task will run last. In fact, in the allotted +// sleeping time nothing may actually run or they may +// run in any order. The purpose of the test is unclear. +#ifndef OS_WIN +TEST_P(EnvPosixTestWithParam, RunMany) { + env_->SetBackgroundThreads(1, Env::LOW); + std::atomic<int> last_id(0); + + struct CB { + std::atomic<int>* last_id_ptr; // Pointer to shared slot + int id; // Order# for the execution of this callback + + CB(std::atomic<int>* p, int i) : last_id_ptr(p), id(i) {} + + static void Run(void* v) { + CB* cb = reinterpret_cast<CB*>(v); + int cur = cb->last_id_ptr->load(); + ASSERT_EQ(cb->id - 1, cur); + cb->last_id_ptr->store(cb->id); + } + }; + + // Schedule in different order than start time + CB cb1(&last_id, 1); + CB cb2(&last_id, 2); + CB cb3(&last_id, 3); + CB cb4(&last_id, 4); + env_->Schedule(&CB::Run, &cb1); + env_->Schedule(&CB::Run, &cb2); + env_->Schedule(&CB::Run, &cb3); + env_->Schedule(&CB::Run, &cb4); + // thread-pool pops a thread function and then run the function, which may + // cause threadpool is empty but the last function is still running. Add a + // dummy function at the end, to make sure the last callback is finished + // before threadpool is empty. 
+ struct DummyCB { + static void Run(void*) {} + }; + env_->Schedule(&DummyCB::Run, nullptr); + + WaitThreadPoolsEmpty(); + ASSERT_EQ(4, last_id.load(std::memory_order_acquire)); +} +#endif + +struct State { + port::Mutex mu; + int val; + int num_running; +}; + +static void ThreadBody(void* arg) { + State* s = reinterpret_cast<State*>(arg); + s->mu.Lock(); + s->val += 1; + s->num_running -= 1; + s->mu.Unlock(); +} + +TEST_P(EnvPosixTestWithParam, StartThread) { + State state; + state.val = 0; + state.num_running = 3; + for (int i = 0; i < 3; i++) { + env_->StartThread(&ThreadBody, &state); + } + while (true) { + state.mu.Lock(); + int num = state.num_running; + state.mu.Unlock(); + if (num == 0) { + break; + } + Env::Default()->SleepForMicroseconds(kDelayMicros); + } + ASSERT_EQ(state.val, 3); + WaitThreadPoolsEmpty(); +} + +TEST_P(EnvPosixTestWithParam, TwoPools) { + // Data structures to signal tasks to run. + port::Mutex mutex; + port::CondVar cv(&mutex); + bool should_start = false; + + class CB { + public: + CB(const std::string& pool_name, int pool_size, port::Mutex* trigger_mu, + port::CondVar* trigger_cv, bool* _should_start) + : mu_(), + num_running_(0), + num_finished_(0), + pool_size_(pool_size), + pool_name_(pool_name), + trigger_mu_(trigger_mu), + trigger_cv_(trigger_cv), + should_start_(_should_start) {} + + static void Run(void* v) { + CB* cb = reinterpret_cast<CB*>(v); + cb->Run(); + } + + void Run() { + { + MutexLock l(&mu_); + num_running_++; + // make sure we don't have more than pool_size_ jobs running. 
+ ASSERT_LE(num_running_, pool_size_.load()); + } + + { + MutexLock l(trigger_mu_); + while (!(*should_start_)) { + trigger_cv_->Wait(); + } + } + + { + MutexLock l(&mu_); + num_running_--; + num_finished_++; + } + } + + int NumFinished() { + MutexLock l(&mu_); + return num_finished_; + } + + void Reset(int pool_size) { + pool_size_.store(pool_size); + num_finished_ = 0; + } + + private: + port::Mutex mu_; + int num_running_; + int num_finished_; + std::atomic<int> pool_size_; + std::string pool_name_; + port::Mutex* trigger_mu_; + port::CondVar* trigger_cv_; + bool* should_start_; + }; + + const int kLowPoolSize = 2; + const int kHighPoolSize = 4; + const int kJobs = 8; + + CB low_pool_job("low", kLowPoolSize, &mutex, &cv, &should_start); + CB high_pool_job("high", kHighPoolSize, &mutex, &cv, &should_start); + + env_->SetBackgroundThreads(kLowPoolSize); + env_->SetBackgroundThreads(kHighPoolSize, Env::Priority::HIGH); + + ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::LOW)); + ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH)); + + // schedule same number of jobs in each pool + for (int i = 0; i < kJobs; i++) { + env_->Schedule(&CB::Run, &low_pool_job); + env_->Schedule(&CB::Run, &high_pool_job, Env::Priority::HIGH); + } + // Wait a short while for the jobs to be dispatched. + int sleep_count = 0; + while ((unsigned int)(kJobs - kLowPoolSize) != + env_->GetThreadPoolQueueLen(Env::Priority::LOW) || + (unsigned int)(kJobs - kHighPoolSize) != + env_->GetThreadPoolQueueLen(Env::Priority::HIGH)) { + env_->SleepForMicroseconds(kDelayMicros); + if (++sleep_count > 100) { + break; + } + } + + ASSERT_EQ((unsigned int)(kJobs - kLowPoolSize), + env_->GetThreadPoolQueueLen()); + ASSERT_EQ((unsigned int)(kJobs - kLowPoolSize), + env_->GetThreadPoolQueueLen(Env::Priority::LOW)); + ASSERT_EQ((unsigned int)(kJobs - kHighPoolSize), + env_->GetThreadPoolQueueLen(Env::Priority::HIGH)); + + // Trigger jobs to run. 
+ { + MutexLock l(&mutex); + should_start = true; + cv.SignalAll(); + } + + // wait for all jobs to finish + while (low_pool_job.NumFinished() < kJobs || + high_pool_job.NumFinished() < kJobs) { + env_->SleepForMicroseconds(kDelayMicros); + } + + ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::LOW)); + ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH)); + + // Hold jobs to schedule; + should_start = false; + + // call IncBackgroundThreadsIfNeeded to two pools. One increasing and + // the other decreasing + env_->IncBackgroundThreadsIfNeeded(kLowPoolSize - 1, Env::Priority::LOW); + env_->IncBackgroundThreadsIfNeeded(kHighPoolSize + 1, Env::Priority::HIGH); + high_pool_job.Reset(kHighPoolSize + 1); + low_pool_job.Reset(kLowPoolSize); + + // schedule same number of jobs in each pool + for (int i = 0; i < kJobs; i++) { + env_->Schedule(&CB::Run, &low_pool_job); + env_->Schedule(&CB::Run, &high_pool_job, Env::Priority::HIGH); + } + // Wait a short while for the jobs to be dispatched. + sleep_count = 0; + while ((unsigned int)(kJobs - kLowPoolSize) != + env_->GetThreadPoolQueueLen(Env::Priority::LOW) || + (unsigned int)(kJobs - (kHighPoolSize + 1)) != + env_->GetThreadPoolQueueLen(Env::Priority::HIGH)) { + env_->SleepForMicroseconds(kDelayMicros); + if (++sleep_count > 100) { + break; + } + } + ASSERT_EQ((unsigned int)(kJobs - kLowPoolSize), + env_->GetThreadPoolQueueLen()); + ASSERT_EQ((unsigned int)(kJobs - kLowPoolSize), + env_->GetThreadPoolQueueLen(Env::Priority::LOW)); + ASSERT_EQ((unsigned int)(kJobs - (kHighPoolSize + 1)), + env_->GetThreadPoolQueueLen(Env::Priority::HIGH)); + + // Trigger jobs to run. 
+ { + MutexLock l(&mutex); + should_start = true; + cv.SignalAll(); + } + + // wait for all jobs to finish + while (low_pool_job.NumFinished() < kJobs || + high_pool_job.NumFinished() < kJobs) { + env_->SleepForMicroseconds(kDelayMicros); + } + + env_->SetBackgroundThreads(kHighPoolSize, Env::Priority::HIGH); + WaitThreadPoolsEmpty(); +} + +TEST_P(EnvPosixTestWithParam, DecreaseNumBgThreads) { + constexpr int kWaitMicros = 60000000; // 1min + + std::vector<test::SleepingBackgroundTask> tasks(10); + + // Set number of thread to 1 first. + env_->SetBackgroundThreads(1, Env::Priority::HIGH); + + // Schedule 3 tasks. 0 running; Task 1, 2 waiting. + for (size_t i = 0; i < 3; i++) { + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &tasks[i], + Env::Priority::HIGH); + } + ASSERT_FALSE(tasks[0].TimedWaitUntilSleeping(kWaitMicros)); + ASSERT_EQ(2U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH)); + ASSERT_TRUE(tasks[0].IsSleeping()); + ASSERT_TRUE(!tasks[1].IsSleeping()); + ASSERT_TRUE(!tasks[2].IsSleeping()); + + // Increase to 2 threads. Task 0, 1 running; 2 waiting + env_->SetBackgroundThreads(2, Env::Priority::HIGH); + ASSERT_FALSE(tasks[1].TimedWaitUntilSleeping(kWaitMicros)); + ASSERT_EQ(1U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH)); + ASSERT_TRUE(tasks[0].IsSleeping()); + ASSERT_TRUE(tasks[1].IsSleeping()); + ASSERT_TRUE(!tasks[2].IsSleeping()); + + // Shrink back to 1 thread. Still task 0, 1 running, 2 waiting + env_->SetBackgroundThreads(1, Env::Priority::HIGH); + Env::Default()->SleepForMicroseconds(kDelayMicros); + ASSERT_EQ(1U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH)); + ASSERT_TRUE(tasks[0].IsSleeping()); + ASSERT_TRUE(tasks[1].IsSleeping()); + ASSERT_TRUE(!tasks[2].IsSleeping()); + + // The last task finishes. Task 0 running, 2 waiting. 
+ tasks[1].WakeUp(); + ASSERT_FALSE(tasks[1].TimedWaitUntilDone(kWaitMicros)); + ASSERT_EQ(1U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH)); + ASSERT_TRUE(tasks[0].IsSleeping()); + ASSERT_TRUE(!tasks[1].IsSleeping()); + ASSERT_TRUE(!tasks[2].IsSleeping()); + + // Increase to 5 threads. Task 0 and 2 running. + env_->SetBackgroundThreads(5, Env::Priority::HIGH); + ASSERT_FALSE(tasks[2].TimedWaitUntilSleeping(kWaitMicros)); + ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH)); + ASSERT_TRUE(tasks[0].IsSleeping()); + ASSERT_TRUE(!tasks[1].IsSleeping()); + ASSERT_TRUE(tasks[2].IsSleeping()); + + // Change number of threads a couple of times while there is no sufficient + // tasks. + env_->SetBackgroundThreads(7, Env::Priority::HIGH); + tasks[2].WakeUp(); + ASSERT_FALSE(tasks[2].TimedWaitUntilDone(kWaitMicros)); + ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH)); + env_->SetBackgroundThreads(3, Env::Priority::HIGH); + Env::Default()->SleepForMicroseconds(kDelayMicros); + ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH)); + env_->SetBackgroundThreads(4, Env::Priority::HIGH); + Env::Default()->SleepForMicroseconds(kDelayMicros); + ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH)); + env_->SetBackgroundThreads(5, Env::Priority::HIGH); + Env::Default()->SleepForMicroseconds(kDelayMicros); + ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH)); + env_->SetBackgroundThreads(4, Env::Priority::HIGH); + Env::Default()->SleepForMicroseconds(kDelayMicros); + ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH)); + + Env::Default()->SleepForMicroseconds(kDelayMicros * 50); + + // Enqueue 5 more tasks. Thread pool size now is 4. + // Task 0, 3, 4, 5 running;6, 7 waiting. 
+ for (size_t i = 3; i < 8; i++) { + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &tasks[i], + Env::Priority::HIGH); + } + for (size_t i = 3; i <= 5; i++) { + ASSERT_FALSE(tasks[i].TimedWaitUntilSleeping(kWaitMicros)); + } + ASSERT_EQ(2U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH)); + ASSERT_TRUE(tasks[0].IsSleeping()); + ASSERT_TRUE(!tasks[1].IsSleeping()); + ASSERT_TRUE(!tasks[2].IsSleeping()); + ASSERT_TRUE(tasks[3].IsSleeping()); + ASSERT_TRUE(tasks[4].IsSleeping()); + ASSERT_TRUE(tasks[5].IsSleeping()); + ASSERT_TRUE(!tasks[6].IsSleeping()); + ASSERT_TRUE(!tasks[7].IsSleeping()); + + // Wake up task 0, 3 and 4. Task 5, 6, 7 running. + tasks[0].WakeUp(); + tasks[3].WakeUp(); + tasks[4].WakeUp(); + + for (size_t i = 5; i < 8; i++) { + ASSERT_FALSE(tasks[i].TimedWaitUntilSleeping(kWaitMicros)); + } + ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH)); + for (size_t i = 5; i < 8; i++) { + ASSERT_TRUE(tasks[i].IsSleeping()); + } + + // Shrink back to 1 thread. Still task 5, 6, 7 running + env_->SetBackgroundThreads(1, Env::Priority::HIGH); + Env::Default()->SleepForMicroseconds(kDelayMicros); + ASSERT_TRUE(tasks[5].IsSleeping()); + ASSERT_TRUE(tasks[6].IsSleeping()); + ASSERT_TRUE(tasks[7].IsSleeping()); + + // Wake up task 6. Task 5, 7 running + tasks[6].WakeUp(); + ASSERT_FALSE(tasks[6].TimedWaitUntilDone(kWaitMicros)); + ASSERT_TRUE(tasks[5].IsSleeping()); + ASSERT_TRUE(!tasks[6].IsSleeping()); + ASSERT_TRUE(tasks[7].IsSleeping()); + + // Wake up threads 7. Task 5 running + tasks[7].WakeUp(); + ASSERT_FALSE(tasks[7].TimedWaitUntilDone(kWaitMicros)); + ASSERT_TRUE(!tasks[7].IsSleeping()); + + // Enqueue thread 8 and 9. Task 5 running; one of 8, 9 might be running. 
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &tasks[8], + Env::Priority::HIGH); + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &tasks[9], + Env::Priority::HIGH); + Env::Default()->SleepForMicroseconds(kDelayMicros); + ASSERT_GT(env_->GetThreadPoolQueueLen(Env::Priority::HIGH), (unsigned int)0); + ASSERT_TRUE(!tasks[8].IsSleeping() || !tasks[9].IsSleeping()); + + // Increase to 4 threads. Task 5, 8, 9 running. + env_->SetBackgroundThreads(4, Env::Priority::HIGH); + Env::Default()->SleepForMicroseconds(kDelayMicros); + ASSERT_EQ((unsigned int)0, env_->GetThreadPoolQueueLen(Env::Priority::HIGH)); + ASSERT_TRUE(tasks[8].IsSleeping()); + ASSERT_TRUE(tasks[9].IsSleeping()); + + // Shrink to 1 thread + env_->SetBackgroundThreads(1, Env::Priority::HIGH); + + // Wake up thread 9. + tasks[9].WakeUp(); + ASSERT_FALSE(tasks[9].TimedWaitUntilDone(kWaitMicros)); + ASSERT_TRUE(!tasks[9].IsSleeping()); + ASSERT_TRUE(tasks[8].IsSleeping()); + + // Wake up thread 8 + tasks[8].WakeUp(); + ASSERT_FALSE(tasks[8].TimedWaitUntilDone(kWaitMicros)); + ASSERT_TRUE(!tasks[8].IsSleeping()); + + // Wake up the last thread + tasks[5].WakeUp(); + ASSERT_FALSE(tasks[5].TimedWaitUntilDone(kWaitMicros)); + WaitThreadPoolsEmpty(); +} + +TEST_P(EnvPosixTestWithParam, ReserveThreads) { + // Initialize the background thread to 1 in case other threads exist + // from the last unit test + env_->SetBackgroundThreads(1, Env::Priority::HIGH); + ASSERT_EQ(env_->GetBackgroundThreads(Env::HIGH), 1); + constexpr int kWaitMicros = 10000000; // 10seconds + std::vector<test::SleepingBackgroundTask> tasks(4); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + // Set the sync point to ensure thread 0 can terminate + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"ThreadPoolImpl::BGThread::Termination:th0", + "EnvTest::ReserveThreads:0"}}); + // Empty the thread pool to ensure all the threads can start later + env_->SetBackgroundThreads(0, 
Env::Priority::HIGH); + TEST_SYNC_POINT("EnvTest::ReserveThreads:0"); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + // Set the sync point to ensure threads start and pass the sync point + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"ThreadPoolImpl::BGThread::Start:th0", "EnvTest::ReserveThreads:1"}, + {"ThreadPoolImpl::BGThread::Start:th1", "EnvTest::ReserveThreads:2"}, + {"ThreadPoolImpl::BGThread::Start:th2", "EnvTest::ReserveThreads:3"}, + {"ThreadPoolImpl::BGThread::Start:th3", "EnvTest::ReserveThreads:4"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // Set number of thread to 3 first. + env_->SetBackgroundThreads(3, Env::Priority::HIGH); + ASSERT_EQ(env_->GetBackgroundThreads(Env::HIGH), 3); + // Add sync points to ensure all 3 threads start + TEST_SYNC_POINT("EnvTest::ReserveThreads:1"); + TEST_SYNC_POINT("EnvTest::ReserveThreads:2"); + TEST_SYNC_POINT("EnvTest::ReserveThreads:3"); + // Reserve 2 threads + ASSERT_EQ(2, env_->ReserveThreads(2, Env::Priority::HIGH)); + + // Schedule 3 tasks. Task 0 running (in this context, doing + // SleepingBackgroundTask); Task 1, 2 waiting; 3 reserved threads. + for (size_t i = 0; i < 3; i++) { + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &tasks[i], + Env::Priority::HIGH); + } + ASSERT_FALSE(tasks[0].TimedWaitUntilSleeping(kWaitMicros)); + ASSERT_EQ(2U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH)); + ASSERT_TRUE(tasks[0].IsSleeping()); + ASSERT_TRUE(!tasks[1].IsSleeping()); + ASSERT_TRUE(!tasks[2].IsSleeping()); + + // Release 2 threads. Task 0, 1, 2 running; 0 reserved thread. 
+ ASSERT_EQ(2, env_->ReleaseThreads(2, Env::Priority::HIGH)); + ASSERT_FALSE(tasks[1].TimedWaitUntilSleeping(kWaitMicros)); + ASSERT_FALSE(tasks[2].TimedWaitUntilSleeping(kWaitMicros)); + ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH)); + ASSERT_TRUE(tasks[1].IsSleeping()); + ASSERT_TRUE(tasks[2].IsSleeping()); + // No more threads can be reserved + ASSERT_EQ(0, env_->ReserveThreads(3, Env::Priority::HIGH)); + // Expand the number of background threads so that the last thread + // is waiting + env_->SetBackgroundThreads(4, Env::Priority::HIGH); + // Add sync point to ensure the 4th thread starts + TEST_SYNC_POINT("EnvTest::ReserveThreads:4"); + // As the thread pool is expanded, we can reserve one more thread + ASSERT_EQ(1, env_->ReserveThreads(3, Env::Priority::HIGH)); + // No more threads can be reserved + ASSERT_EQ(0, env_->ReserveThreads(3, Env::Priority::HIGH)); + + // Reset the sync points for the next iteration in BGThread or the + // next time Submit() is called + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"ThreadPoolImpl::BGThread::WaitingThreadsInc", + "EnvTest::ReserveThreads:5"}, + {"ThreadPoolImpl::BGThread::Termination", "EnvTest::ReserveThreads:6"}, + {"ThreadPoolImpl::Submit::Enqueue", "EnvTest::ReserveThreads:7"}}); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + tasks[0].WakeUp(); + ASSERT_FALSE(tasks[0].TimedWaitUntilDone(kWaitMicros)); + // Add sync point to ensure the number of waiting threads increases + TEST_SYNC_POINT("EnvTest::ReserveThreads:5"); + // 1 more thread can be reserved + ASSERT_EQ(1, env_->ReserveThreads(3, Env::Priority::HIGH)); + // 2 reserved threads now + + // Currently, two threads are blocked since the number of waiting + // threads is equal to the number of reserved threads (i.e., 2). 
+ // If we reduce the number of background thread to 1, at least one thread + // will be the last excessive thread (here we have no control over the + // number of excessive threads because thread order does not + // necessarily follows the schedule order, but we ensure that the last thread + // shall not run any task by expanding the thread pool after we schedule + // the tasks), and thus they(it) become(s) unblocked, the number of waiting + // threads decreases to 0 or 1, but the number of reserved threads is still 2 + env_->SetBackgroundThreads(1, Env::Priority::HIGH); + + // Task 1,2 running; 2 reserved threads, however, in fact, we only have + // 0 or 1 waiting thread in the thread pool, proved by the + // following test, we CANNOT reserve 2 threads even though we just + // release 2 + TEST_SYNC_POINT("EnvTest::ReserveThreads:6"); + ASSERT_EQ(2, env_->ReleaseThreads(2, Env::Priority::HIGH)); + ASSERT_GT(2, env_->ReserveThreads(2, Env::Priority::HIGH)); + + // Every new task will be put into the queue at this point + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &tasks[3], + Env::Priority::HIGH); + TEST_SYNC_POINT("EnvTest::ReserveThreads:7"); + ASSERT_EQ(1U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH)); + ASSERT_TRUE(!tasks[3].IsSleeping()); + + // Set the number of threads to 3 so that Task 3 can dequeue + env_->SetBackgroundThreads(3, Env::Priority::HIGH); + // Wakup Task 1 + tasks[1].WakeUp(); + ASSERT_FALSE(tasks[1].TimedWaitUntilDone(kWaitMicros)); + // Task 2, 3 running (Task 3 dequeue); 0 or 1 reserved thread + ASSERT_FALSE(tasks[3].TimedWaitUntilSleeping(kWaitMicros)); + ASSERT_TRUE(tasks[3].IsSleeping()); + ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH)); + + // At most 1 thread can be released + ASSERT_GT(2, env_->ReleaseThreads(3, Env::Priority::HIGH)); + tasks[2].WakeUp(); + ASSERT_FALSE(tasks[2].TimedWaitUntilDone(kWaitMicros)); + tasks[3].WakeUp(); + ASSERT_FALSE(tasks[3].TimedWaitUntilDone(kWaitMicros)); + 
WaitThreadPoolsEmpty(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +#if (defined OS_LINUX || defined OS_WIN) +namespace { +bool IsSingleVarint(const std::string& s) { + Slice slice(s); + + uint64_t v; + if (!GetVarint64(&slice, &v)) { + return false; + } + + return slice.size() == 0; +} + +bool IsUniqueIDValid(const std::string& s) { + return !s.empty() && !IsSingleVarint(s); +} + +const size_t MAX_ID_SIZE = 100; +char temp_id[MAX_ID_SIZE]; + +} // namespace + +// Determine whether we can use the FS_IOC_GETVERSION ioctl +// on a file in directory DIR. Create a temporary file therein, +// try to apply the ioctl (save that result), cleanup and +// return the result. Return true if it is supported, and +// false if anything fails. +// Note that this function "knows" that dir has just been created +// and is empty, so we create a simply-named test file: "f". +bool ioctl_support__FS_IOC_GETVERSION(const std::string& dir) { +#ifdef OS_WIN + return true; +#else + const std::string file = dir + "/f"; + int fd; + do { + fd = open(file.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644); + } while (fd < 0 && errno == EINTR); + long int version; + bool ok = (fd >= 0 && ioctl(fd, FS_IOC_GETVERSION, &version) >= 0); + + close(fd); + unlink(file.c_str()); + + return ok; +#endif +} + +// To ensure that Env::GetUniqueId-related tests work correctly, the files +// should be stored in regular storage like "hard disk" or "flash device", +// and not on a tmpfs file system (like /dev/shm and /tmp on some systems). +// Otherwise we cannot get the correct id. +// +// This function serves as the replacement for test::TmpDir(), which may be +// customized to be on a file system that doesn't work with GetUniqueId(). 
+ +class IoctlFriendlyTmpdir { + public: + explicit IoctlFriendlyTmpdir() { + char dir_buf[100]; + + const char* fmt = "%s/rocksdb.XXXXXX"; + const char* tmp = getenv("TEST_IOCTL_FRIENDLY_TMPDIR"); + +#ifdef OS_WIN +#define rmdir _rmdir + if (tmp == nullptr) { + tmp = getenv("TMP"); + } + + snprintf(dir_buf, sizeof dir_buf, fmt, tmp); + auto result = _mktemp(dir_buf); + assert(result != nullptr); + BOOL ret = CreateDirectory(dir_buf, NULL); + assert(ret == TRUE); + dir_ = dir_buf; +#else + std::list<std::string> candidate_dir_list = {"/var/tmp", "/tmp"}; + + // If $TEST_IOCTL_FRIENDLY_TMPDIR/rocksdb.XXXXXX fits, use + // $TEST_IOCTL_FRIENDLY_TMPDIR; subtract 2 for the "%s", and + // add 1 for the trailing NUL byte. + if (tmp && strlen(tmp) + strlen(fmt) - 2 + 1 <= sizeof dir_buf) { + // use $TEST_IOCTL_FRIENDLY_TMPDIR value + candidate_dir_list.push_front(tmp); + } + + for (const std::string& d : candidate_dir_list) { + snprintf(dir_buf, sizeof dir_buf, fmt, d.c_str()); + if (mkdtemp(dir_buf)) { + if (ioctl_support__FS_IOC_GETVERSION(dir_buf)) { + dir_ = dir_buf; + return; + } else { + // Diagnose ioctl-related failure only if this is the + // directory specified via that envvar. + if (tmp && tmp == d) { + fprintf(stderr, + "TEST_IOCTL_FRIENDLY_TMPDIR-specified directory is " + "not suitable: %s\n", + d.c_str()); + } + rmdir(dir_buf); // ignore failure + } + } else { + // mkdtemp failed: diagnose it, but don't give up. + fprintf(stderr, "mkdtemp(%s/...) 
failed: %s\n", d.c_str(), + errnoStr(errno).c_str()); + } + } + + // check if it's running test within a docker container, in which case, the + // file system inside `overlayfs` may not support FS_IOC_GETVERSION + // skip the tests + struct stat buffer; + if (stat("/.dockerenv", &buffer) == 0) { + is_supported_ = false; + return; + } + + fprintf(stderr, + "failed to find an ioctl-friendly temporary directory;" + " specify one via the TEST_IOCTL_FRIENDLY_TMPDIR envvar\n"); + std::abort(); +#endif + } + + ~IoctlFriendlyTmpdir() { rmdir(dir_.c_str()); } + + const std::string& name() const { return dir_; } + + bool is_supported() const { return is_supported_; } + + private: + std::string dir_; + + bool is_supported_ = true; +}; + +#ifndef ROCKSDB_LITE +TEST_F(EnvPosixTest, PositionedAppend) { + std::unique_ptr<WritableFile> writable_file; + EnvOptions options; + options.use_direct_writes = true; + options.use_mmap_writes = false; + std::string fname = test::PerThreadDBPath(env_, "positioned_append"); + SetupSyncPointsToMockDirectIO(); + + ASSERT_OK(env_->NewWritableFile(fname, &writable_file, options)); + const size_t kBlockSize = 4096; + const size_t kDataSize = kPageSize; + // Write a page worth of 'a' + auto data_ptr = NewAligned(kDataSize, 'a'); + Slice data_a(data_ptr.get(), kDataSize); + ASSERT_OK(writable_file->PositionedAppend(data_a, 0U)); + // Write a page worth of 'b' right after the first sector + data_ptr = NewAligned(kDataSize, 'b'); + Slice data_b(data_ptr.get(), kDataSize); + ASSERT_OK(writable_file->PositionedAppend(data_b, kBlockSize)); + ASSERT_OK(writable_file->Close()); + // The file now has 1 sector worth of a followed by a page worth of b + + // Verify the above + std::unique_ptr<SequentialFile> seq_file; + ASSERT_OK(env_->NewSequentialFile(fname, &seq_file, options)); + size_t scratch_len = kPageSize * 2; + std::unique_ptr<char[]> scratch(new char[scratch_len]); + Slice result; + ASSERT_OK(seq_file->Read(scratch_len, &result, scratch.get())); + 
ASSERT_EQ(kPageSize + kBlockSize, result.size()); + ASSERT_EQ('a', result[kBlockSize - 1]); + ASSERT_EQ('b', result[kBlockSize]); +} +#endif // !ROCKSDB_LITE + +// `GetUniqueId()` temporarily returns zero on Windows. `BlockBasedTable` can +// handle a return value of zero but this test case cannot. +#ifndef OS_WIN +TEST_P(EnvPosixTestWithParam, RandomAccessUniqueID) { + // Create file. + if (env_ == Env::Default()) { + EnvOptions soptions; + soptions.use_direct_reads = soptions.use_direct_writes = direct_io_; + IoctlFriendlyTmpdir ift; + if (!ift.is_supported()) { + ROCKSDB_GTEST_BYPASS( + "FS_IOC_GETVERSION is not supported by the filesystem"); + return; + } + std::string fname = ift.name() + "/testfile"; + std::unique_ptr<WritableFile> wfile; + ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions)); + + std::unique_ptr<RandomAccessFile> file; + + // Get Unique ID + ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions)); + size_t id_size = file->GetUniqueId(temp_id, MAX_ID_SIZE); + ASSERT_TRUE(id_size > 0); + std::string unique_id1(temp_id, id_size); + ASSERT_TRUE(IsUniqueIDValid(unique_id1)); + + // Get Unique ID again + ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions)); + id_size = file->GetUniqueId(temp_id, MAX_ID_SIZE); + ASSERT_TRUE(id_size > 0); + std::string unique_id2(temp_id, id_size); + ASSERT_TRUE(IsUniqueIDValid(unique_id2)); + + // Get Unique ID again after waiting some time. + env_->SleepForMicroseconds(1000000); + ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions)); + id_size = file->GetUniqueId(temp_id, MAX_ID_SIZE); + ASSERT_TRUE(id_size > 0); + std::string unique_id3(temp_id, id_size); + ASSERT_TRUE(IsUniqueIDValid(unique_id3)); + + // Check IDs are the same. 
+ ASSERT_EQ(unique_id1, unique_id2); + ASSERT_EQ(unique_id2, unique_id3); + + // Delete the file + ASSERT_OK(env_->DeleteFile(fname)); + } +} +#endif // !defined(OS_WIN) + +// only works in linux platforms +#ifdef ROCKSDB_FALLOCATE_PRESENT +TEST_P(EnvPosixTestWithParam, AllocateTest) { + if (env_ == Env::Default()) { + SetupSyncPointsToMockDirectIO(); + std::string fname = test::PerThreadDBPath(env_, "preallocate_testfile"); + // Try fallocate in a file to see whether the target file system supports + // it. + // Skip the test if fallocate is not supported. + std::string fname_test_fallocate = + test::PerThreadDBPath(env_, "preallocate_testfile_2"); + int fd = -1; + do { + fd = open(fname_test_fallocate.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644); + } while (fd < 0 && errno == EINTR); + ASSERT_GT(fd, 0); + + int alloc_status = fallocate(fd, 0, 0, 1); + + int err_number = 0; + if (alloc_status != 0) { + err_number = errno; + fprintf(stderr, "Warning: fallocate() fails, %s\n", + errnoStr(err_number).c_str()); + } + close(fd); + ASSERT_OK(env_->DeleteFile(fname_test_fallocate)); + if (alloc_status != 0 && err_number == EOPNOTSUPP) { + // The filesystem containing the file does not support fallocate + return; + } + + EnvOptions soptions; + soptions.use_mmap_writes = false; + soptions.use_direct_reads = soptions.use_direct_writes = direct_io_; + std::unique_ptr<WritableFile> wfile; + ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions)); + + // allocate 100 MB + size_t kPreallocateSize = 100 * 1024 * 1024; + size_t kBlockSize = 512; + size_t kDataSize = 1024 * 1024; + auto data_ptr = NewAligned(kDataSize, 'A'); + Slice data(data_ptr.get(), kDataSize); + wfile->SetPreallocationBlockSize(kPreallocateSize); + wfile->PrepareWrite(wfile->GetFileSize(), kDataSize); + ASSERT_OK(wfile->Append(data)); + ASSERT_OK(wfile->Flush()); + + struct stat f_stat; + ASSERT_EQ(stat(fname.c_str(), &f_stat), 0); + ASSERT_EQ((unsigned int)kDataSize, f_stat.st_size); + // verify that blocks 
are preallocated + // Note here that we don't check the exact number of blocks preallocated -- + // we only require that number of allocated blocks is at least what we + // expect. + // It looks like some FS give us more blocks that we asked for. That's fine. + // It might be worth investigating further. + ASSERT_LE((unsigned int)(kPreallocateSize / kBlockSize), f_stat.st_blocks); + + // close the file, should deallocate the blocks + wfile.reset(); + + stat(fname.c_str(), &f_stat); + ASSERT_EQ((unsigned int)kDataSize, f_stat.st_size); + // verify that preallocated blocks were deallocated on file close + // Because the FS might give us more blocks, we add a full page to the size + // and expect the number of blocks to be less or equal to that. + ASSERT_GE((f_stat.st_size + kPageSize + kBlockSize - 1) / kBlockSize, + (unsigned int)f_stat.st_blocks); + } +} +#endif // ROCKSDB_FALLOCATE_PRESENT + +// Returns true if any of the strings in ss are the prefix of another string. +bool HasPrefix(const std::unordered_set<std::string>& ss) { + for (const std::string& s : ss) { + if (s.empty()) { + return true; + } + for (size_t i = 1; i < s.size(); ++i) { + if (ss.count(s.substr(0, i)) != 0) { + return true; + } + } + } + return false; +} + +// `GetUniqueId()` temporarily returns zero on Windows. `BlockBasedTable` can +// handle a return value of zero but this test case cannot. +#ifndef OS_WIN +TEST_P(EnvPosixTestWithParam, RandomAccessUniqueIDConcurrent) { + if (env_ == Env::Default()) { + // Check whether a bunch of concurrently existing files have unique IDs. 
+ EnvOptions soptions; + soptions.use_direct_reads = soptions.use_direct_writes = direct_io_; + + // Create the files + IoctlFriendlyTmpdir ift; + if (!ift.is_supported()) { + ROCKSDB_GTEST_BYPASS( + "FS_IOC_GETVERSION is not supported by the filesystem"); + return; + } + std::vector<std::string> fnames; + for (int i = 0; i < 1000; ++i) { + fnames.push_back(ift.name() + "/" + "testfile" + std::to_string(i)); + + // Create file. + std::unique_ptr<WritableFile> wfile; + ASSERT_OK(env_->NewWritableFile(fnames[i], &wfile, soptions)); + } + + // Collect and check whether the IDs are unique. + std::unordered_set<std::string> ids; + for (const std::string& fname : fnames) { + std::unique_ptr<RandomAccessFile> file; + std::string unique_id; + ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions)); + size_t id_size = file->GetUniqueId(temp_id, MAX_ID_SIZE); + ASSERT_TRUE(id_size > 0); + unique_id = std::string(temp_id, id_size); + ASSERT_TRUE(IsUniqueIDValid(unique_id)); + + ASSERT_TRUE(ids.count(unique_id) == 0); + ids.insert(unique_id); + } + + // Delete the files + for (const std::string& fname : fnames) { + ASSERT_OK(env_->DeleteFile(fname)); + } + + ASSERT_TRUE(!HasPrefix(ids)); + } +} + +// TODO: Disable the flaky test, it's a known issue that ext4 may return same +// key after file deletion. The issue is tracked in #7405, #7470. +TEST_P(EnvPosixTestWithParam, DISABLED_RandomAccessUniqueIDDeletes) { + if (env_ == Env::Default()) { + EnvOptions soptions; + soptions.use_direct_reads = soptions.use_direct_writes = direct_io_; + + IoctlFriendlyTmpdir ift; + if (!ift.is_supported()) { + ROCKSDB_GTEST_BYPASS( + "FS_IOC_GETVERSION is not supported by the filesystem"); + return; + } + std::string fname = ift.name() + "/" + "testfile"; + + // Check that after file is deleted we don't get same ID again in a new + // file. + std::unordered_set<std::string> ids; + for (int i = 0; i < 1000; ++i) { + // Create file. 
+ { + std::unique_ptr<WritableFile> wfile; + ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions)); + } + + // Get Unique ID + std::string unique_id; + { + std::unique_ptr<RandomAccessFile> file; + ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions)); + size_t id_size = file->GetUniqueId(temp_id, MAX_ID_SIZE); + ASSERT_TRUE(id_size > 0); + unique_id = std::string(temp_id, id_size); + } + + ASSERT_TRUE(IsUniqueIDValid(unique_id)); + ASSERT_TRUE(ids.count(unique_id) == 0); + ids.insert(unique_id); + + // Delete the file + ASSERT_OK(env_->DeleteFile(fname)); + } + + ASSERT_TRUE(!HasPrefix(ids)); + } +} +#endif // !defined(OS_WIN) + +TEST_P(EnvPosixTestWithParam, MultiRead) { + EnvOptions soptions; + soptions.use_direct_reads = soptions.use_direct_writes = direct_io_; + std::string fname = test::PerThreadDBPath(env_, "testfile"); + + const size_t kSectorSize = 4096; + const size_t kNumSectors = 8; + + // Create file. + { + std::unique_ptr<WritableFile> wfile; +#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && \ + !defined(OS_AIX) + if (soptions.use_direct_writes) { + soptions.use_direct_writes = false; + } +#endif + ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions)); + for (size_t i = 0; i < kNumSectors; ++i) { + auto data = NewAligned(kSectorSize * 8, static_cast<char>(i + 1)); + Slice slice(data.get(), kSectorSize); + ASSERT_OK(wfile->Append(slice)); + } + ASSERT_OK(wfile->Close()); + } + + // More attempts to simulate more partial result sequences. + for (uint32_t attempt = 0; attempt < 20; attempt++) { + // Random Read + Random rnd(301 + attempt); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "UpdateResults::io_uring_result", [&](void* arg) { + if (attempt > 0) { + // No failure in the first attempt. 
+ size_t& bytes_read = *static_cast<size_t*>(arg); + if (rnd.OneIn(4)) { + bytes_read = 0; + } else if (rnd.OneIn(3)) { + bytes_read = static_cast<size_t>( + rnd.Uniform(static_cast<int>(bytes_read))); + } + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + std::unique_ptr<RandomAccessFile> file; + std::vector<ReadRequest> reqs(3); + std::vector<std::unique_ptr<char, Deleter>> data; + uint64_t offset = 0; + for (size_t i = 0; i < reqs.size(); ++i) { + reqs[i].offset = offset; + offset += 2 * kSectorSize; + reqs[i].len = kSectorSize; + data.emplace_back(NewAligned(kSectorSize, 0)); + reqs[i].scratch = data.back().get(); + } +#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && \ + !defined(OS_AIX) + if (soptions.use_direct_reads) { + soptions.use_direct_reads = false; + } +#endif + ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions)); + ASSERT_OK(file->MultiRead(reqs.data(), reqs.size())); + for (size_t i = 0; i < reqs.size(); ++i) { + auto buf = NewAligned(kSectorSize * 8, static_cast<char>(i * 2 + 1)); + ASSERT_OK(reqs[i].status); + ASSERT_EQ(memcmp(reqs[i].scratch, buf.get(), kSectorSize), 0); + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + } +} + +TEST_F(EnvPosixTest, MultiReadNonAlignedLargeNum) { + // In this test we don't do aligned read, so it doesn't work for + // direct I/O case. + EnvOptions soptions; + soptions.use_direct_reads = soptions.use_direct_writes = false; + std::string fname = test::PerThreadDBPath(env_, "testfile"); + + const size_t kTotalSize = 81920; + Random rnd(301); + std::string expected_data = rnd.RandomString(kTotalSize); + + // Create file. + { + std::unique_ptr<WritableFile> wfile; + ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions)); + ASSERT_OK(wfile->Append(expected_data)); + ASSERT_OK(wfile->Close()); + } + + // More attempts to simulate more partial result sequences. 
+ for (uint32_t attempt = 0; attempt < 25; attempt++) { + // Right now kIoUringDepth is hard coded as 256, so we need very large + // number of keys to cover the case of multiple rounds of submissions. + // Right now the test latency is still acceptable. If it ends up with + // too long, we can modify the io uring depth with SyncPoint here. + const int num_reads = rnd.Uniform(512) + 1; + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "UpdateResults::io_uring_result", [&](void* arg) { + if (attempt > 5) { + // Improve partial result rates in second half of the run to + // cover the case of repeated partial results. + int odd = (attempt < 15) ? num_reads / 2 : 4; + // No failure in first several attempts. + size_t& bytes_read = *static_cast<size_t*>(arg); + if (rnd.OneIn(odd)) { + bytes_read = 0; + } else if (rnd.OneIn(odd / 2)) { + bytes_read = static_cast<size_t>( + rnd.Uniform(static_cast<int>(bytes_read))); + } + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // Generate (offset, len) pairs + std::set<int> start_offsets; + for (int i = 0; i < num_reads; i++) { + int rnd_off; + // No repeat offsets. + while (start_offsets.find(rnd_off = rnd.Uniform(81920)) != + start_offsets.end()) { + } + start_offsets.insert(rnd_off); + } + std::vector<size_t> offsets; + std::vector<size_t> lens; + // std::set already sorted the offsets. 
+ for (int so : start_offsets) { + offsets.push_back(so); + } + for (size_t i = 0; i + 1 < offsets.size(); i++) { + lens.push_back(static_cast<size_t>( + rnd.Uniform(static_cast<int>(offsets[i + 1] - offsets[i])) + 1)); + } + lens.push_back(static_cast<size_t>( + rnd.Uniform(static_cast<int>(kTotalSize - offsets.back())) + 1)); + ASSERT_EQ(num_reads, lens.size()); + + // Create requests + std::vector<std::string> scratches; + scratches.reserve(num_reads); + std::vector<ReadRequest> reqs(num_reads); + for (size_t i = 0; i < reqs.size(); ++i) { + reqs[i].offset = offsets[i]; + reqs[i].len = lens[i]; + scratches.emplace_back(reqs[i].len, ' '); + reqs[i].scratch = const_cast<char*>(scratches.back().data()); + } + + // Query the data + std::unique_ptr<RandomAccessFile> file; + ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions)); + ASSERT_OK(file->MultiRead(reqs.data(), reqs.size())); + + // Validate results + for (int i = 0; i < num_reads; ++i) { + ASSERT_OK(reqs[i].status); + ASSERT_EQ( + Slice(expected_data.data() + offsets[i], lens[i]).ToString(true), + reqs[i].result.ToString(true)); + } + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + } +} + +#ifndef ROCKSDB_LITE +TEST_F(EnvPosixTest, NonAlignedDirectIOMultiReadBeyondFileSize) { + EnvOptions soptions; + soptions.use_direct_reads = true; + soptions.use_direct_writes = false; + std::string fname = test::PerThreadDBPath(env_, "testfile"); + + Random rnd(301); + std::unique_ptr<WritableFile> wfile; + size_t alignment = 0; + // Create file. 
+ { + ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions)); + auto data_ptr = NewAligned(4095, 'b'); + Slice data_b(data_ptr.get(), 4095); + ASSERT_OK(wfile->PositionedAppend(data_b, 0U)); + ASSERT_OK(wfile->Close()); + } + +#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && \ + !defined(OS_AIX) && !defined(OS_OPENBSD) && !defined(OS_FREEBSD) + if (soptions.use_direct_reads) { + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "NewRandomAccessFile:O_DIRECT", [&](void* arg) { + int* val = static_cast<int*>(arg); + *val &= ~O_DIRECT; + }); + } +#endif + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + const int num_reads = 2; + // Create requests + std::vector<std::string> scratches; + scratches.reserve(num_reads); + std::vector<ReadRequest> reqs(num_reads); + + std::unique_ptr<RandomAccessFile> file; + ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions)); + alignment = file->GetRequiredBufferAlignment(); + ASSERT_EQ(num_reads, reqs.size()); + + std::vector<std::unique_ptr<char, Deleter>> data; + + std::vector<size_t> offsets = {0, 2047}; + std::vector<size_t> lens = {2047, 4096 - 2047}; + + for (size_t i = 0; i < num_reads; i++) { + // Do alignment + reqs[i].offset = static_cast<uint64_t>(TruncateToPageBoundary( + alignment, static_cast<size_t>(/*offset=*/offsets[i]))); + reqs[i].len = + Roundup(static_cast<size_t>(/*offset=*/offsets[i]) + /*length=*/lens[i], + alignment) - + reqs[i].offset; + + size_t new_capacity = Roundup(reqs[i].len, alignment); + data.emplace_back(NewAligned(new_capacity, 0)); + reqs[i].scratch = data.back().get(); + } + + // Query the data + ASSERT_OK(file->MultiRead(reqs.data(), reqs.size())); + + // Validate results + for (size_t i = 0; i < num_reads; ++i) { + ASSERT_OK(reqs[i].status); + } + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} +#endif // ROCKSDB_LITE + +#if defined(ROCKSDB_IOURING_PRESENT) +void GenerateFilesAndRequest(Env* env, const 
std::string& fname, + std::vector<ReadRequest>* ret_reqs, + std::vector<std::string>* scratches) { + const size_t kTotalSize = 81920; + Random rnd(301); + std::string expected_data = rnd.RandomString(kTotalSize); + + // Create file. + { + std::unique_ptr<WritableFile> wfile; + ASSERT_OK(env->NewWritableFile(fname, &wfile, EnvOptions())); + ASSERT_OK(wfile->Append(expected_data)); + ASSERT_OK(wfile->Close()); + } + + // Right now kIoUringDepth is hard coded as 256, so we need very large + // number of keys to cover the case of multiple rounds of submissions. + // Right now the test latency is still acceptable. If it ends up with + // too long, we can modify the io uring depth with SyncPoint here. + const int num_reads = 3; + std::vector<size_t> offsets = {10000, 20000, 30000}; + std::vector<size_t> lens = {3000, 200, 100}; + + // Create requests + scratches->reserve(num_reads); + std::vector<ReadRequest>& reqs = *ret_reqs; + reqs.resize(num_reads); + for (int i = 0; i < num_reads; ++i) { + reqs[i].offset = offsets[i]; + reqs[i].len = lens[i]; + scratches->emplace_back(reqs[i].len, ' '); + reqs[i].scratch = const_cast<char*>(scratches->back().data()); + } +} + +TEST_F(EnvPosixTest, MultiReadIOUringError) { + // In this test we don't do aligned read, so we can't do direct I/O. 
+ EnvOptions soptions; + soptions.use_direct_reads = soptions.use_direct_writes = false; + std::string fname = test::PerThreadDBPath(env_, "testfile"); + + std::vector<std::string> scratches; + std::vector<ReadRequest> reqs; + GenerateFilesAndRequest(env_, fname, &reqs, &scratches); + // Query the data + std::unique_ptr<RandomAccessFile> file; + ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions)); + + bool io_uring_wait_cqe_called = false; + SyncPoint::GetInstance()->SetCallBack( + "PosixRandomAccessFile::MultiRead:io_uring_wait_cqe:return", + [&](void* arg) { + if (!io_uring_wait_cqe_called) { + io_uring_wait_cqe_called = true; + ssize_t& ret = *(static_cast<ssize_t*>(arg)); + ret = 1; + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + + Status s = file->MultiRead(reqs.data(), reqs.size()); + if (io_uring_wait_cqe_called) { + ASSERT_NOK(s); + } else { + s.PermitUncheckedError(); + } + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(EnvPosixTest, MultiReadIOUringError2) { + // In this test we don't do aligned read, so we can't do direct I/O. 
+ EnvOptions soptions; + soptions.use_direct_reads = soptions.use_direct_writes = false; + std::string fname = test::PerThreadDBPath(env_, "testfile"); + + std::vector<std::string> scratches; + std::vector<ReadRequest> reqs; + GenerateFilesAndRequest(env_, fname, &reqs, &scratches); + // Query the data + std::unique_ptr<RandomAccessFile> file; + ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions)); + + bool io_uring_submit_and_wait_called = false; + SyncPoint::GetInstance()->SetCallBack( + "PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return1", + [&](void* arg) { + io_uring_submit_and_wait_called = true; + ssize_t* ret = static_cast<ssize_t*>(arg); + (*ret)--; + }); + SyncPoint::GetInstance()->SetCallBack( + "PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return2", + [&](void* arg) { + struct io_uring* iu = static_cast<struct io_uring*>(arg); + struct io_uring_cqe* cqe; + assert(io_uring_wait_cqe(iu, &cqe) == 0); + io_uring_cqe_seen(iu, cqe); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + Status s = file->MultiRead(reqs.data(), reqs.size()); + if (io_uring_submit_and_wait_called) { + ASSERT_NOK(s); + } else { + s.PermitUncheckedError(); + } + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} +#endif // ROCKSDB_IOURING_PRESENT + +// Only works in linux platforms +#ifdef OS_WIN +TEST_P(EnvPosixTestWithParam, DISABLED_InvalidateCache) { +#else +TEST_P(EnvPosixTestWithParam, InvalidateCache) { +#endif + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + EnvOptions soptions; + soptions.use_direct_reads = soptions.use_direct_writes = direct_io_; + std::string fname = test::PerThreadDBPath(env_, "testfile"); + + const size_t kSectorSize = 512; + auto data = NewAligned(kSectorSize, 0); + Slice slice(data.get(), kSectorSize); + + // Create file. 
+ { + std::unique_ptr<WritableFile> wfile; +#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && \ + !defined(OS_AIX) + if (soptions.use_direct_writes) { + soptions.use_direct_writes = false; + } +#endif + ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions)); + ASSERT_OK(wfile->Append(slice)); + ASSERT_OK(wfile->InvalidateCache(0, 0)); + ASSERT_OK(wfile->Close()); + } + + // Random Read + { + std::unique_ptr<RandomAccessFile> file; + auto scratch = NewAligned(kSectorSize, 0); + Slice result; +#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && \ + !defined(OS_AIX) + if (soptions.use_direct_reads) { + soptions.use_direct_reads = false; + } +#endif + ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions)); + ASSERT_OK(file->Read(0, kSectorSize, &result, scratch.get())); + ASSERT_EQ(memcmp(scratch.get(), data.get(), kSectorSize), 0); + ASSERT_OK(file->InvalidateCache(0, 11)); + ASSERT_OK(file->InvalidateCache(0, 0)); + } + + // Sequential Read + { + std::unique_ptr<SequentialFile> file; + auto scratch = NewAligned(kSectorSize, 0); + Slice result; +#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && \ + !defined(OS_AIX) + if (soptions.use_direct_reads) { + soptions.use_direct_reads = false; + } +#endif + ASSERT_OK(env_->NewSequentialFile(fname, &file, soptions)); + if (file->use_direct_io()) { + ASSERT_OK(file->PositionedRead(0, kSectorSize, &result, scratch.get())); + } else { + ASSERT_OK(file->Read(kSectorSize, &result, scratch.get())); + } + ASSERT_EQ(memcmp(scratch.get(), data.get(), kSectorSize), 0); + ASSERT_OK(file->InvalidateCache(0, 11)); + ASSERT_OK(file->InvalidateCache(0, 0)); + } + // Delete the file + ASSERT_OK(env_->DeleteFile(fname)); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace(); +} +#endif // OS_LINUX || OS_WIN + +class TestLogger : public Logger { + public: + using Logger::Logv; + void Logv(const char* format, va_list ap) override { + log_count++; + + char 
new_format[550]; + std::fill_n(new_format, sizeof(new_format), '2'); + { + va_list backup_ap; + va_copy(backup_ap, ap); + int n = vsnprintf(new_format, sizeof(new_format) - 1, format, backup_ap); + // 48 bytes for extra information + bytes allocated + +// When we have n == -1 there is not a terminating zero expected +#ifdef OS_WIN + if (n < 0) { + char_0_count++; + } +#endif + + if (new_format[0] == '[') { + // "[DEBUG] " + ASSERT_TRUE(n <= 56 + (512 - static_cast<int>(sizeof(port::TimeVal)))); + } else { + ASSERT_TRUE(n <= 48 + (512 - static_cast<int>(sizeof(port::TimeVal)))); + } + va_end(backup_ap); + } + + for (size_t i = 0; i < sizeof(new_format); i++) { + if (new_format[i] == 'x') { + char_x_count++; + } else if (new_format[i] == '\0') { + char_0_count++; + } + } + } + int log_count; + int char_x_count; + int char_0_count; +}; + +TEST_P(EnvPosixTestWithParam, LogBufferTest) { + TestLogger test_logger; + test_logger.SetInfoLogLevel(InfoLogLevel::INFO_LEVEL); + test_logger.log_count = 0; + test_logger.char_x_count = 0; + test_logger.char_0_count = 0; + LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, &test_logger); + LogBuffer log_buffer_debug(DEBUG_LEVEL, &test_logger); + + char bytes200[200]; + std::fill_n(bytes200, sizeof(bytes200), '1'); + bytes200[sizeof(bytes200) - 1] = '\0'; + char bytes600[600]; + std::fill_n(bytes600, sizeof(bytes600), '1'); + bytes600[sizeof(bytes600) - 1] = '\0'; + char bytes9000[9000]; + std::fill_n(bytes9000, sizeof(bytes9000), '1'); + bytes9000[sizeof(bytes9000) - 1] = '\0'; + + ROCKS_LOG_BUFFER(&log_buffer, "x%sx", bytes200); + ROCKS_LOG_BUFFER(&log_buffer, "x%sx", bytes600); + ROCKS_LOG_BUFFER(&log_buffer, "x%sx%sx%sx", bytes200, bytes200, bytes200); + ROCKS_LOG_BUFFER(&log_buffer, "x%sx%sx", bytes200, bytes600); + ROCKS_LOG_BUFFER(&log_buffer, "x%sx%sx", bytes600, bytes9000); + + ROCKS_LOG_BUFFER(&log_buffer_debug, "x%sx", bytes200); + test_logger.SetInfoLogLevel(DEBUG_LEVEL); + ROCKS_LOG_BUFFER(&log_buffer_debug, "x%sx%sx%sx", 
bytes600, bytes9000, + bytes200); + + ASSERT_EQ(0, test_logger.log_count); + log_buffer.FlushBufferToLog(); + log_buffer_debug.FlushBufferToLog(); + ASSERT_EQ(6, test_logger.log_count); + ASSERT_EQ(6, test_logger.char_0_count); + ASSERT_EQ(10, test_logger.char_x_count); +} + +class TestLogger2 : public Logger { + public: + explicit TestLogger2(size_t max_log_size) : max_log_size_(max_log_size) {} + using Logger::Logv; + void Logv(const char* format, va_list ap) override { + char new_format[2000]; + std::fill_n(new_format, sizeof(new_format), '2'); + { + va_list backup_ap; + va_copy(backup_ap, ap); + int n = vsnprintf(new_format, sizeof(new_format) - 1, format, backup_ap); + // 48 bytes for extra information + bytes allocated + ASSERT_TRUE(n <= + 48 + static_cast<int>(max_log_size_ - sizeof(port::TimeVal))); + ASSERT_TRUE(n > static_cast<int>(max_log_size_ - sizeof(port::TimeVal))); + va_end(backup_ap); + } + } + size_t max_log_size_; +}; + +TEST_P(EnvPosixTestWithParam, LogBufferMaxSizeTest) { + char bytes9000[9000]; + std::fill_n(bytes9000, sizeof(bytes9000), '1'); + bytes9000[sizeof(bytes9000) - 1] = '\0'; + + for (size_t max_log_size = 256; max_log_size <= 1024; + max_log_size += 1024 - 256) { + TestLogger2 test_logger(max_log_size); + test_logger.SetInfoLogLevel(InfoLogLevel::INFO_LEVEL); + LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, &test_logger); + ROCKS_LOG_BUFFER_MAX_SZ(&log_buffer, max_log_size, "%s", bytes9000); + log_buffer.FlushBufferToLog(); + } +} + +TEST_P(EnvPosixTestWithParam, Preallocation) { + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + const std::string src = test::PerThreadDBPath(env_, "testfile"); + std::unique_ptr<WritableFile> srcfile; + EnvOptions soptions; + soptions.use_direct_reads = soptions.use_direct_writes = direct_io_; +#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && \ + !defined(OS_AIX) && !defined(OS_OPENBSD) && !defined(OS_FREEBSD) + if (soptions.use_direct_writes) { + 
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "NewWritableFile:O_DIRECT", [&](void* arg) { + int* val = static_cast<int*>(arg); + *val &= ~O_DIRECT; + }); + } +#endif + ASSERT_OK(env_->NewWritableFile(src, &srcfile, soptions)); + srcfile->SetPreallocationBlockSize(1024 * 1024); + + // No writes should mean no preallocation + size_t block_size, last_allocated_block; + srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); + ASSERT_EQ(last_allocated_block, 0UL); + + // Small write should preallocate one block + size_t kStrSize = 4096; + auto data = NewAligned(kStrSize, 'A'); + Slice str(data.get(), kStrSize); + srcfile->PrepareWrite(srcfile->GetFileSize(), kStrSize); + ASSERT_OK(srcfile->Append(str)); + srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); + ASSERT_EQ(last_allocated_block, 1UL); + + // Write an entire preallocation block, make sure we increased by two. + { + auto buf_ptr = NewAligned(block_size, ' '); + Slice buf(buf_ptr.get(), block_size); + srcfile->PrepareWrite(srcfile->GetFileSize(), block_size); + ASSERT_OK(srcfile->Append(buf)); + srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); + ASSERT_EQ(last_allocated_block, 2UL); + } + + // Write five more blocks at once, ensure we're where we need to be. + { + auto buf_ptr = NewAligned(block_size * 5, ' '); + Slice buf = Slice(buf_ptr.get(), block_size * 5); + srcfile->PrepareWrite(srcfile->GetFileSize(), buf.size()); + ASSERT_OK(srcfile->Append(buf)); + srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); + ASSERT_EQ(last_allocated_block, 7UL); + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace(); +} + +// Test that the two ways to get children file attributes (in bulk or +// individually) behave consistently. 
+TEST_P(EnvPosixTestWithParam, ConsistentChildrenAttributes) { + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + EnvOptions soptions; + soptions.use_direct_reads = soptions.use_direct_writes = direct_io_; + const int kNumChildren = 10; + + std::string data; + std::string test_base_dir = test::PerThreadDBPath(env_, "env_test_chr_attr"); + env_->CreateDir(test_base_dir).PermitUncheckedError(); + for (int i = 0; i < kNumChildren; ++i) { + const std::string path = test_base_dir + "/testfile_" + std::to_string(i); + std::unique_ptr<WritableFile> file; +#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && \ + !defined(OS_AIX) && !defined(OS_OPENBSD) && !defined(OS_FREEBSD) + if (soptions.use_direct_writes) { + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "NewWritableFile:O_DIRECT", [&](void* arg) { + int* val = static_cast<int*>(arg); + *val &= ~O_DIRECT; + }); + } +#endif + ASSERT_OK(env_->NewWritableFile(path, &file, soptions)); + auto buf_ptr = NewAligned(data.size(), 'T'); + Slice buf(buf_ptr.get(), data.size()); + ASSERT_OK(file->Append(buf)); + data.append(std::string(4096, 'T')); + } + + std::vector<Env::FileAttributes> file_attrs; + ASSERT_OK(env_->GetChildrenFileAttributes(test_base_dir, &file_attrs)); + for (int i = 0; i < kNumChildren; ++i) { + const std::string name = "testfile_" + std::to_string(i); + const std::string path = test_base_dir + "/" + name; + + auto file_attrs_iter = std::find_if( + file_attrs.begin(), file_attrs.end(), + [&name](const Env::FileAttributes& fm) { return fm.name == name; }); + ASSERT_TRUE(file_attrs_iter != file_attrs.end()); + uint64_t size; + ASSERT_OK(env_->GetFileSize(path, &size)); + ASSERT_EQ(size, 4096 * i); + ASSERT_EQ(size, file_attrs_iter->size_bytes); + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace(); +} + +// Test that all WritableFileWrapper forwards all calls to WritableFile. 
+TEST_P(EnvPosixTestWithParam, WritableFileWrapper) { + class Base : public WritableFile { + public: + mutable int* step_; + + void inc(int x) const { EXPECT_EQ(x, (*step_)++); } + + explicit Base(int* step) : step_(step) { inc(0); } + + Status Append(const Slice& /*data*/) override { + inc(1); + return Status::OK(); + } + + Status Append( + const Slice& /*data*/, + const DataVerificationInfo& /* verification_info */) override { + inc(1); + return Status::OK(); + } + + Status PositionedAppend(const Slice& /*data*/, + uint64_t /*offset*/) override { + inc(2); + return Status::OK(); + } + + Status PositionedAppend( + const Slice& /*data*/, uint64_t /*offset*/, + const DataVerificationInfo& /* verification_info */) override { + inc(2); + return Status::OK(); + } + + Status Truncate(uint64_t /*size*/) override { + inc(3); + return Status::OK(); + } + + Status Close() override { + inc(4); + return Status::OK(); + } + + Status Flush() override { + inc(5); + return Status::OK(); + } + + Status Sync() override { + inc(6); + return Status::OK(); + } + + Status Fsync() override { + inc(7); + return Status::OK(); + } + + bool IsSyncThreadSafe() const override { + inc(8); + return true; + } + + bool use_direct_io() const override { + inc(9); + return true; + } + + size_t GetRequiredBufferAlignment() const override { + inc(10); + return 0; + } + + void SetIOPriority(Env::IOPriority /*pri*/) override { inc(11); } + + Env::IOPriority GetIOPriority() override { + inc(12); + return Env::IOPriority::IO_LOW; + } + + void SetWriteLifeTimeHint(Env::WriteLifeTimeHint /*hint*/) override { + inc(13); + } + + Env::WriteLifeTimeHint GetWriteLifeTimeHint() override { + inc(14); + return Env::WriteLifeTimeHint::WLTH_NOT_SET; + } + + uint64_t GetFileSize() override { + inc(15); + return 0; + } + + void SetPreallocationBlockSize(size_t /*size*/) override { inc(16); } + + void GetPreallocationStatus(size_t* /*block_size*/, + size_t* /*last_allocated_block*/) override { + inc(17); + } + + size_t 
GetUniqueId(char* /*id*/, size_t /*max_size*/) const override { + inc(18); + return 0; + } + + Status InvalidateCache(size_t /*offset*/, size_t /*length*/) override { + inc(19); + return Status::OK(); + } + + Status RangeSync(uint64_t /*offset*/, uint64_t /*nbytes*/) override { + inc(20); + return Status::OK(); + } + + void PrepareWrite(size_t /*offset*/, size_t /*len*/) override { inc(21); } + + Status Allocate(uint64_t /*offset*/, uint64_t /*len*/) override { + inc(22); + return Status::OK(); + } + + public: + ~Base() override { inc(23); } + }; + + class Wrapper : public WritableFileWrapper { + public: + explicit Wrapper(WritableFile* target) : WritableFileWrapper(target) {} + }; + + int step = 0; + + { + Base b(&step); + Wrapper w(&b); + ASSERT_OK(w.Append(Slice())); + ASSERT_OK(w.PositionedAppend(Slice(), 0)); + ASSERT_OK(w.Truncate(0)); + ASSERT_OK(w.Close()); + ASSERT_OK(w.Flush()); + ASSERT_OK(w.Sync()); + ASSERT_OK(w.Fsync()); + w.IsSyncThreadSafe(); + w.use_direct_io(); + w.GetRequiredBufferAlignment(); + w.SetIOPriority(Env::IOPriority::IO_HIGH); + w.GetIOPriority(); + w.SetWriteLifeTimeHint(Env::WriteLifeTimeHint::WLTH_NOT_SET); + w.GetWriteLifeTimeHint(); + w.GetFileSize(); + w.SetPreallocationBlockSize(0); + w.GetPreallocationStatus(nullptr, nullptr); + w.GetUniqueId(nullptr, 0); + ASSERT_OK(w.InvalidateCache(0, 0)); + ASSERT_OK(w.RangeSync(0, 0)); + w.PrepareWrite(0, 0); + ASSERT_OK(w.Allocate(0, 0)); + } + + EXPECT_EQ(24, step); +} + +TEST_P(EnvPosixTestWithParam, PosixRandomRWFile) { + const std::string path = test::PerThreadDBPath(env_, "random_rw_file"); + + env_->DeleteFile(path).PermitUncheckedError(); + + std::unique_ptr<RandomRWFile> file; + + // Cannot open non-existing file. 
+ ASSERT_NOK(env_->NewRandomRWFile(path, &file, EnvOptions())); + + // Create the file using WritableFile + { + std::unique_ptr<WritableFile> wf; + ASSERT_OK(env_->NewWritableFile(path, &wf, EnvOptions())); + } + + ASSERT_OK(env_->NewRandomRWFile(path, &file, EnvOptions())); + + char buf[10000]; + Slice read_res; + + ASSERT_OK(file->Write(0, "ABCD")); + ASSERT_OK(file->Read(0, 10, &read_res, buf)); + ASSERT_EQ(read_res.ToString(), "ABCD"); + + ASSERT_OK(file->Write(2, "XXXX")); + ASSERT_OK(file->Read(0, 10, &read_res, buf)); + ASSERT_EQ(read_res.ToString(), "ABXXXX"); + + ASSERT_OK(file->Write(10, "ZZZ")); + ASSERT_OK(file->Read(10, 10, &read_res, buf)); + ASSERT_EQ(read_res.ToString(), "ZZZ"); + + ASSERT_OK(file->Write(11, "Y")); + ASSERT_OK(file->Read(10, 10, &read_res, buf)); + ASSERT_EQ(read_res.ToString(), "ZYZ"); + + ASSERT_OK(file->Write(200, "FFFFF")); + ASSERT_OK(file->Read(200, 10, &read_res, buf)); + ASSERT_EQ(read_res.ToString(), "FFFFF"); + + ASSERT_OK(file->Write(205, "XXXX")); + ASSERT_OK(file->Read(200, 10, &read_res, buf)); + ASSERT_EQ(read_res.ToString(), "FFFFFXXXX"); + + ASSERT_OK(file->Write(5, "QQQQ")); + ASSERT_OK(file->Read(0, 9, &read_res, buf)); + ASSERT_EQ(read_res.ToString(), "ABXXXQQQQ"); + + ASSERT_OK(file->Read(2, 4, &read_res, buf)); + ASSERT_EQ(read_res.ToString(), "XXXQ"); + + // Close file and reopen it + ASSERT_OK(file->Close()); + ASSERT_OK(env_->NewRandomRWFile(path, &file, EnvOptions())); + + ASSERT_OK(file->Read(0, 9, &read_res, buf)); + ASSERT_EQ(read_res.ToString(), "ABXXXQQQQ"); + + ASSERT_OK(file->Read(10, 3, &read_res, buf)); + ASSERT_EQ(read_res.ToString(), "ZYZ"); + + ASSERT_OK(file->Read(200, 9, &read_res, buf)); + ASSERT_EQ(read_res.ToString(), "FFFFFXXXX"); + + ASSERT_OK(file->Write(4, "TTTTTTTTTTTTTTTT")); + ASSERT_OK(file->Read(0, 10, &read_res, buf)); + ASSERT_EQ(read_res.ToString(), "ABXXTTTTTT"); + + // Clean up + ASSERT_OK(env_->DeleteFile(path)); +} + +class RandomRWFileWithMirrorString { + public: + explicit 
RandomRWFileWithMirrorString(RandomRWFile* _file) : file_(_file) {} + + void Write(size_t offset, const std::string& data) { + // Write to mirror string + StringWrite(offset, data); + + // Write to file + Status s = file_->Write(offset, data); + ASSERT_OK(s) << s.ToString(); + } + + void Read(size_t offset = 0, size_t n = 1000000) { + Slice str_res(nullptr, 0); + if (offset < file_mirror_.size()) { + size_t str_res_sz = std::min(file_mirror_.size() - offset, n); + str_res = Slice(file_mirror_.data() + offset, str_res_sz); + StopSliceAtNull(&str_res); + } + + Slice file_res; + Status s = file_->Read(offset, n, &file_res, buf_); + ASSERT_OK(s) << s.ToString(); + StopSliceAtNull(&file_res); + + ASSERT_EQ(str_res.ToString(), file_res.ToString()) << offset << " " << n; + } + + void SetFile(RandomRWFile* _file) { file_ = _file; } + + private: + void StringWrite(size_t offset, const std::string& src) { + if (offset + src.size() > file_mirror_.size()) { + file_mirror_.resize(offset + src.size(), '\0'); + } + + char* pos = const_cast<char*>(file_mirror_.data() + offset); + memcpy(pos, src.data(), src.size()); + } + + void StopSliceAtNull(Slice* slc) { + for (size_t i = 0; i < slc->size(); i++) { + if ((*slc)[i] == '\0') { + *slc = Slice(slc->data(), i); + break; + } + } + } + + char buf_[10000]; + RandomRWFile* file_; + std::string file_mirror_; +}; + +TEST_P(EnvPosixTestWithParam, PosixRandomRWFileRandomized) { + const std::string path = test::PerThreadDBPath(env_, "random_rw_file_rand"); + env_->DeleteFile(path).PermitUncheckedError(); + + std::unique_ptr<RandomRWFile> file; + +#ifdef OS_LINUX + // Cannot open non-existing file. 
+ ASSERT_NOK(env_->NewRandomRWFile(path, &file, EnvOptions())); +#endif + + // Create the file using WritableFile + { + std::unique_ptr<WritableFile> wf; + ASSERT_OK(env_->NewWritableFile(path, &wf, EnvOptions())); + } + + ASSERT_OK(env_->NewRandomRWFile(path, &file, EnvOptions())); + RandomRWFileWithMirrorString file_with_mirror(file.get()); + + Random rnd(301); + std::string buf; + for (int i = 0; i < 10000; i++) { + // Genrate random data + buf = rnd.RandomString(10); + + // Pick random offset for write + size_t write_off = rnd.Next() % 1000; + file_with_mirror.Write(write_off, buf); + + // Pick random offset for read + size_t read_off = rnd.Next() % 1000; + size_t read_sz = rnd.Next() % 20; + file_with_mirror.Read(read_off, read_sz); + + if (i % 500 == 0) { + // Reopen the file every 500 iters + ASSERT_OK(env_->NewRandomRWFile(path, &file, EnvOptions())); + file_with_mirror.SetFile(file.get()); + } + } + + // clean up + ASSERT_OK(env_->DeleteFile(path)); +} + +class TestEnv : public EnvWrapper { + public: + explicit TestEnv() : EnvWrapper(Env::Default()), close_count(0) {} + const char* Name() const override { return "TestEnv"; } + class TestLogger : public Logger { + public: + using Logger::Logv; + explicit TestLogger(TestEnv* env_ptr) : Logger() { env = env_ptr; } + ~TestLogger() override { + if (!closed_) { + Status s = CloseHelper(); + s.PermitUncheckedError(); + } + } + void Logv(const char* /*format*/, va_list /*ap*/) override {} + + protected: + Status CloseImpl() override { return CloseHelper(); } + + private: + Status CloseHelper() { + env->CloseCountInc(); + return Status::OK(); + } + TestEnv* env; + }; + + void CloseCountInc() { close_count++; } + + int GetCloseCount() { return close_count; } + + Status NewLogger(const std::string& /*fname*/, + std::shared_ptr<Logger>* result) override { + result->reset(new TestLogger(this)); + return Status::OK(); + } + + private: + int close_count; +}; + +class EnvTest : public testing::Test { + public: + EnvTest() 
: test_directory_(test::PerThreadDBPath("env_test")) {} + + protected: + const std::string test_directory_; +}; + +TEST_F(EnvTest, Close) { + TestEnv* env = new TestEnv(); + std::shared_ptr<Logger> logger; + Status s; + + s = env->NewLogger("", &logger); + ASSERT_OK(s); + ASSERT_OK(logger.get()->Close()); + ASSERT_EQ(env->GetCloseCount(), 1); + // Call Close() again. CloseHelper() should not be called again + ASSERT_OK(logger.get()->Close()); + ASSERT_EQ(env->GetCloseCount(), 1); + logger.reset(); + ASSERT_EQ(env->GetCloseCount(), 1); + + s = env->NewLogger("", &logger); + ASSERT_OK(s); + logger.reset(); + ASSERT_EQ(env->GetCloseCount(), 2); + + delete env; +} + +class LogvWithInfoLogLevelLogger : public Logger { + public: + using Logger::Logv; + void Logv(const InfoLogLevel /* log_level */, const char* /* format */, + va_list /* ap */) override {} +}; + +TEST_F(EnvTest, LogvWithInfoLogLevel) { + // Verifies the log functions work on a `Logger` that only overrides the + // `Logv()` overload including `InfoLogLevel`. 
+ const std::string kSampleMessage("sample log message"); + LogvWithInfoLogLevelLogger logger; + ROCKS_LOG_HEADER(&logger, "%s", kSampleMessage.c_str()); + ROCKS_LOG_DEBUG(&logger, "%s", kSampleMessage.c_str()); + ROCKS_LOG_INFO(&logger, "%s", kSampleMessage.c_str()); + ROCKS_LOG_WARN(&logger, "%s", kSampleMessage.c_str()); + ROCKS_LOG_ERROR(&logger, "%s", kSampleMessage.c_str()); + ROCKS_LOG_FATAL(&logger, "%s", kSampleMessage.c_str()); +} + +INSTANTIATE_TEST_CASE_P(DefaultEnvWithoutDirectIO, EnvPosixTestWithParam, + ::testing::Values(std::pair<Env*, bool>(Env::Default(), + false))); +#if !defined(ROCKSDB_LITE) +INSTANTIATE_TEST_CASE_P(DefaultEnvWithDirectIO, EnvPosixTestWithParam, + ::testing::Values(std::pair<Env*, bool>(Env::Default(), + true))); +#endif // !defined(ROCKSDB_LITE) + +#if !defined(ROCKSDB_LITE) && !defined(OS_WIN) +static Env* GetChrootEnv() { + static std::unique_ptr<Env> chroot_env( + NewChrootEnv(Env::Default(), test::TmpDir(Env::Default()))); + return chroot_env.get(); +} +INSTANTIATE_TEST_CASE_P(ChrootEnvWithoutDirectIO, EnvPosixTestWithParam, + ::testing::Values(std::pair<Env*, bool>(GetChrootEnv(), + false))); +INSTANTIATE_TEST_CASE_P(ChrootEnvWithDirectIO, EnvPosixTestWithParam, + ::testing::Values(std::pair<Env*, bool>(GetChrootEnv(), + true))); +#endif // !defined(ROCKSDB_LITE) && !defined(OS_WIN) + +class EnvFSTestWithParam + : public ::testing::Test, + public ::testing::WithParamInterface<std::tuple<bool, bool, bool>> { + public: + EnvFSTestWithParam() { + bool env_non_null = std::get<0>(GetParam()); + bool env_default = std::get<1>(GetParam()); + bool fs_default = std::get<2>(GetParam()); + + env_ = env_non_null ? (env_default ? Env::Default() : nullptr) : nullptr; + fs_ = fs_default + ? 
FileSystem::Default() + : std::make_shared<FaultInjectionTestFS>(FileSystem::Default()); + if (env_non_null && env_default && !fs_default) { + env_ptr_ = NewCompositeEnv(fs_); + } + if (env_non_null && !env_default && fs_default) { + env_ptr_ = + std::unique_ptr<Env>(new FaultInjectionTestEnv(Env::Default())); + fs_.reset(); + } + if (env_non_null && !env_default && !fs_default) { + env_ptr_.reset(new FaultInjectionTestEnv(Env::Default())); + composite_env_ptr_.reset(new CompositeEnvWrapper(env_ptr_.get(), fs_)); + env_ = composite_env_ptr_.get(); + } else { + env_ = env_ptr_.get(); + } + + dbname1_ = test::PerThreadDBPath("env_fs_test1"); + dbname2_ = test::PerThreadDBPath("env_fs_test2"); + } + + ~EnvFSTestWithParam() = default; + + Env* env_; + std::unique_ptr<Env> env_ptr_; + std::unique_ptr<Env> composite_env_ptr_; + std::shared_ptr<FileSystem> fs_; + std::string dbname1_; + std::string dbname2_; +}; + +TEST_P(EnvFSTestWithParam, OptionsTest) { + Options opts; + opts.env = env_; + opts.create_if_missing = true; + std::string dbname = dbname1_; + + if (env_) { + if (fs_) { + ASSERT_EQ(fs_.get(), env_->GetFileSystem().get()); + } else { + ASSERT_NE(FileSystem::Default().get(), env_->GetFileSystem().get()); + } + } + for (int i = 0; i < 2; ++i) { + DB* db; + Status s = DB::Open(opts, dbname, &db); + ASSERT_OK(s); + + WriteOptions wo; + ASSERT_OK(db->Put(wo, "a", "a")); + ASSERT_OK(db->Flush(FlushOptions())); + ASSERT_OK(db->Put(wo, "b", "b")); + ASSERT_OK(db->Flush(FlushOptions())); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + std::string val; + ASSERT_OK(db->Get(ReadOptions(), "a", &val)); + ASSERT_EQ("a", val); + ASSERT_OK(db->Get(ReadOptions(), "b", &val)); + ASSERT_EQ("b", val); + + ASSERT_OK(db->Close()); + delete db; + ASSERT_OK(DestroyDB(dbname, opts)); + + dbname = dbname2_; + } +} + +// The parameters are as follows - +// 1. True means Options::env is non-null, false means null +// 2. 
True means use Env::Default, false means custom +// 3. True means use FileSystem::Default, false means custom +INSTANTIATE_TEST_CASE_P(EnvFSTest, EnvFSTestWithParam, + ::testing::Combine(::testing::Bool(), ::testing::Bool(), + ::testing::Bool())); +// This test ensures that default Env and those allocated by +// NewCompositeEnv() all share the same threadpool +TEST_F(EnvTest, MultipleCompositeEnv) { + std::shared_ptr<FaultInjectionTestFS> fs1 = + std::make_shared<FaultInjectionTestFS>(FileSystem::Default()); + std::shared_ptr<FaultInjectionTestFS> fs2 = + std::make_shared<FaultInjectionTestFS>(FileSystem::Default()); + std::unique_ptr<Env> env1 = NewCompositeEnv(fs1); + std::unique_ptr<Env> env2 = NewCompositeEnv(fs2); + Env::Default()->SetBackgroundThreads(8, Env::HIGH); + Env::Default()->SetBackgroundThreads(16, Env::LOW); + ASSERT_EQ(env1->GetBackgroundThreads(Env::LOW), 16); + ASSERT_EQ(env1->GetBackgroundThreads(Env::HIGH), 8); + ASSERT_EQ(env2->GetBackgroundThreads(Env::LOW), 16); + ASSERT_EQ(env2->GetBackgroundThreads(Env::HIGH), 8); +} + +TEST_F(EnvTest, IsDirectory) { + Status s = Env::Default()->CreateDirIfMissing(test_directory_); + ASSERT_OK(s); + const std::string test_sub_dir = test_directory_ + "sub1"; + const std::string test_file_path = test_directory_ + "file1"; + ASSERT_OK(Env::Default()->CreateDirIfMissing(test_sub_dir)); + bool is_dir = false; + ASSERT_OK(Env::Default()->IsDirectory(test_sub_dir, &is_dir)); + ASSERT_TRUE(is_dir); + { + std::unique_ptr<FSWritableFile> wfile; + s = Env::Default()->GetFileSystem()->NewWritableFile( + test_file_path, FileOptions(), &wfile, /*dbg=*/nullptr); + ASSERT_OK(s); + std::unique_ptr<WritableFileWriter> fwriter; + fwriter.reset(new WritableFileWriter(std::move(wfile), test_file_path, + FileOptions(), + SystemClock::Default().get())); + constexpr char buf[] = "test"; + s = fwriter->Append(buf); + ASSERT_OK(s); + } + ASSERT_OK(Env::Default()->IsDirectory(test_file_path, &is_dir)); + ASSERT_FALSE(is_dir); +} + 
+TEST_F(EnvTest, EnvWriteVerificationTest) { + Status s = Env::Default()->CreateDirIfMissing(test_directory_); + const std::string test_file_path = test_directory_ + "file1"; + ASSERT_OK(s); + std::shared_ptr<FaultInjectionTestFS> fault_fs( + new FaultInjectionTestFS(FileSystem::Default())); + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs)); + std::unique_ptr<WritableFile> file; + s = fault_fs_env->NewWritableFile(test_file_path, &file, EnvOptions()); + ASSERT_OK(s); + + DataVerificationInfo v_info; + std::string test_data = "test"; + std::string checksum; + uint32_t v_crc32c = crc32c::Extend(0, test_data.c_str(), test_data.size()); + PutFixed32(&checksum, v_crc32c); + v_info.checksum = Slice(checksum); + s = file->Append(Slice(test_data), v_info); + ASSERT_OK(s); +} + +class CreateEnvTest : public testing::Test { + public: + CreateEnvTest() { + config_options_.ignore_unknown_options = false; + config_options_.ignore_unsupported_options = false; + } + ConfigOptions config_options_; +}; + +#ifndef ROCKSDB_LITE +TEST_F(CreateEnvTest, LoadCTRProvider) { + config_options_.invoke_prepare_options = false; + std::string CTR = CTREncryptionProvider::kClassName(); + std::shared_ptr<EncryptionProvider> provider; + // Test a provider with no cipher + ASSERT_OK( + EncryptionProvider::CreateFromString(config_options_, CTR, &provider)); + ASSERT_NE(provider, nullptr); + ASSERT_EQ(provider->Name(), CTR); + ASSERT_NOK(provider->PrepareOptions(config_options_)); + ASSERT_NOK(provider->ValidateOptions(DBOptions(), ColumnFamilyOptions())); + auto cipher = provider->GetOptions<std::shared_ptr<BlockCipher>>("Cipher"); + ASSERT_NE(cipher, nullptr); + ASSERT_EQ(cipher->get(), nullptr); + provider.reset(); + + ASSERT_OK(EncryptionProvider::CreateFromString(config_options_, + CTR + "://test", &provider)); + ASSERT_NE(provider, nullptr); + ASSERT_EQ(provider->Name(), CTR); + 
ASSERT_OK(provider->PrepareOptions(config_options_)); + ASSERT_OK(provider->ValidateOptions(DBOptions(), ColumnFamilyOptions())); + cipher = provider->GetOptions<std::shared_ptr<BlockCipher>>("Cipher"); + ASSERT_NE(cipher, nullptr); + ASSERT_NE(cipher->get(), nullptr); + ASSERT_STREQ(cipher->get()->Name(), "ROT13"); + provider.reset(); + + ASSERT_OK(EncryptionProvider::CreateFromString(config_options_, "1://test", + &provider)); + ASSERT_NE(provider, nullptr); + ASSERT_EQ(provider->Name(), CTR); + ASSERT_OK(provider->PrepareOptions(config_options_)); + ASSERT_OK(provider->ValidateOptions(DBOptions(), ColumnFamilyOptions())); + cipher = provider->GetOptions<std::shared_ptr<BlockCipher>>("Cipher"); + ASSERT_NE(cipher, nullptr); + ASSERT_NE(cipher->get(), nullptr); + ASSERT_STREQ(cipher->get()->Name(), "ROT13"); + provider.reset(); + + ASSERT_OK(EncryptionProvider::CreateFromString( + config_options_, "id=" + CTR + "; cipher=ROT13", &provider)); + ASSERT_NE(provider, nullptr); + ASSERT_EQ(provider->Name(), CTR); + cipher = provider->GetOptions<std::shared_ptr<BlockCipher>>("Cipher"); + ASSERT_NE(cipher, nullptr); + ASSERT_NE(cipher->get(), nullptr); + ASSERT_STREQ(cipher->get()->Name(), "ROT13"); + provider.reset(); +} + +TEST_F(CreateEnvTest, LoadROT13Cipher) { + std::shared_ptr<BlockCipher> cipher; + // Test a provider with no cipher + ASSERT_OK(BlockCipher::CreateFromString(config_options_, "ROT13", &cipher)); + ASSERT_NE(cipher, nullptr); + ASSERT_STREQ(cipher->Name(), "ROT13"); +} +#endif // ROCKSDB_LITE + +TEST_F(CreateEnvTest, CreateDefaultSystemClock) { + std::shared_ptr<SystemClock> clock, copy; + ASSERT_OK(SystemClock::CreateFromString(config_options_, + SystemClock::kDefaultName(), &clock)); + ASSERT_NE(clock, nullptr); + ASSERT_EQ(clock, SystemClock::Default()); +#ifndef ROCKSDB_LITE + std::string opts_str = clock->ToString(config_options_); + std::string mismatch; + ASSERT_OK(SystemClock::CreateFromString(config_options_, opts_str, ©)); + 
ASSERT_TRUE(clock->AreEquivalent(config_options_, copy.get(), &mismatch)); +#endif // ROCKSDB_LITE +} + +#ifndef ROCKSDB_LITE +TEST_F(CreateEnvTest, CreateMockSystemClock) { + std::shared_ptr<SystemClock> mock, copy; + + config_options_.registry->AddLibrary("test")->AddFactory<SystemClock>( + MockSystemClock::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr<SystemClock>* guard, + std::string* /* errmsg */) { + guard->reset(new MockSystemClock(nullptr)); + return guard->get(); + }); + ASSERT_OK(SystemClock::CreateFromString( + config_options_, EmulatedSystemClock::kClassName(), &mock)); + ASSERT_NE(mock, nullptr); + ASSERT_STREQ(mock->Name(), EmulatedSystemClock::kClassName()); + ASSERT_EQ(mock->Inner(), SystemClock::Default().get()); + std::string opts_str = mock->ToString(config_options_); + std::string mismatch; + ASSERT_OK(SystemClock::CreateFromString(config_options_, opts_str, ©)); + ASSERT_TRUE(mock->AreEquivalent(config_options_, copy.get(), &mismatch)); + + std::string id = std::string("id=") + EmulatedSystemClock::kClassName() + + ";target=" + MockSystemClock::kClassName(); + + ASSERT_OK(SystemClock::CreateFromString(config_options_, id, &mock)); + ASSERT_NE(mock, nullptr); + ASSERT_STREQ(mock->Name(), EmulatedSystemClock::kClassName()); + ASSERT_NE(mock->Inner(), nullptr); + ASSERT_STREQ(mock->Inner()->Name(), MockSystemClock::kClassName()); + ASSERT_EQ(mock->Inner()->Inner(), SystemClock::Default().get()); + opts_str = mock->ToString(config_options_); + ASSERT_OK(SystemClock::CreateFromString(config_options_, opts_str, ©)); + ASSERT_TRUE(mock->AreEquivalent(config_options_, copy.get(), &mismatch)); + ASSERT_OK(SystemClock::CreateFromString( + config_options_, EmulatedSystemClock::kClassName(), &mock)); +} + +TEST_F(CreateEnvTest, CreateReadOnlyFileSystem) { + std::shared_ptr<FileSystem> fs, copy; + + ASSERT_OK(FileSystem::CreateFromString( + config_options_, ReadOnlyFileSystem::kClassName(), &fs)); + ASSERT_NE(fs, nullptr); + 
ASSERT_STREQ(fs->Name(), ReadOnlyFileSystem::kClassName()); + ASSERT_EQ(fs->Inner(), FileSystem::Default().get()); + + std::string opts_str = fs->ToString(config_options_); + std::string mismatch; + + ASSERT_OK(FileSystem::CreateFromString(config_options_, opts_str, ©)); + ASSERT_TRUE(fs->AreEquivalent(config_options_, copy.get(), &mismatch)); + + ASSERT_OK(FileSystem::CreateFromString( + config_options_, + std::string("id=") + ReadOnlyFileSystem::kClassName() + + "; target=" + TimedFileSystem::kClassName(), + &fs)); + ASSERT_NE(fs, nullptr); + opts_str = fs->ToString(config_options_); + ASSERT_STREQ(fs->Name(), ReadOnlyFileSystem::kClassName()); + ASSERT_NE(fs->Inner(), nullptr); + ASSERT_STREQ(fs->Inner()->Name(), TimedFileSystem::kClassName()); + ASSERT_EQ(fs->Inner()->Inner(), FileSystem::Default().get()); + ASSERT_OK(FileSystem::CreateFromString(config_options_, opts_str, ©)); + ASSERT_TRUE(fs->AreEquivalent(config_options_, copy.get(), &mismatch)); +} + +TEST_F(CreateEnvTest, CreateTimedFileSystem) { + std::shared_ptr<FileSystem> fs, copy; + + ASSERT_OK(FileSystem::CreateFromString(config_options_, + TimedFileSystem::kClassName(), &fs)); + ASSERT_NE(fs, nullptr); + ASSERT_STREQ(fs->Name(), TimedFileSystem::kClassName()); + ASSERT_EQ(fs->Inner(), FileSystem::Default().get()); + + std::string opts_str = fs->ToString(config_options_); + std::string mismatch; + + ASSERT_OK(FileSystem::CreateFromString(config_options_, opts_str, ©)); + ASSERT_TRUE(fs->AreEquivalent(config_options_, copy.get(), &mismatch)); + + ASSERT_OK(FileSystem::CreateFromString( + config_options_, + std::string("id=") + TimedFileSystem::kClassName() + + "; target=" + ReadOnlyFileSystem::kClassName(), + &fs)); + ASSERT_NE(fs, nullptr); + opts_str = fs->ToString(config_options_); + ASSERT_STREQ(fs->Name(), TimedFileSystem::kClassName()); + ASSERT_NE(fs->Inner(), nullptr); + ASSERT_STREQ(fs->Inner()->Name(), ReadOnlyFileSystem::kClassName()); + ASSERT_EQ(fs->Inner()->Inner(), 
FileSystem::Default().get()); + ASSERT_OK(FileSystem::CreateFromString(config_options_, opts_str, ©)); + ASSERT_TRUE(fs->AreEquivalent(config_options_, copy.get(), &mismatch)); +} + +TEST_F(CreateEnvTest, CreateCountedFileSystem) { + std::shared_ptr<FileSystem> fs, copy; + + ASSERT_OK(FileSystem::CreateFromString(config_options_, + CountedFileSystem::kClassName(), &fs)); + ASSERT_NE(fs, nullptr); + ASSERT_STREQ(fs->Name(), CountedFileSystem::kClassName()); + ASSERT_EQ(fs->Inner(), FileSystem::Default().get()); + + std::string opts_str = fs->ToString(config_options_); + std::string mismatch; + + ASSERT_OK(FileSystem::CreateFromString(config_options_, opts_str, ©)); + ASSERT_TRUE(fs->AreEquivalent(config_options_, copy.get(), &mismatch)); + + ASSERT_OK(FileSystem::CreateFromString( + config_options_, + std::string("id=") + CountedFileSystem::kClassName() + + "; target=" + ReadOnlyFileSystem::kClassName(), + &fs)); + ASSERT_NE(fs, nullptr); + opts_str = fs->ToString(config_options_); + ASSERT_STREQ(fs->Name(), CountedFileSystem::kClassName()); + ASSERT_NE(fs->Inner(), nullptr); + ASSERT_STREQ(fs->Inner()->Name(), ReadOnlyFileSystem::kClassName()); + ASSERT_EQ(fs->Inner()->Inner(), FileSystem::Default().get()); + ASSERT_OK(FileSystem::CreateFromString(config_options_, opts_str, ©)); + ASSERT_TRUE(fs->AreEquivalent(config_options_, copy.get(), &mismatch)); +} + +#ifndef OS_WIN +TEST_F(CreateEnvTest, CreateChrootFileSystem) { + std::shared_ptr<FileSystem> fs, copy; + auto tmp_dir = test::TmpDir(Env::Default()); + // The Chroot FileSystem has a required "chroot_dir" option. 
+ ASSERT_NOK(FileSystem::CreateFromString(config_options_, + ChrootFileSystem::kClassName(), &fs)); + + // ChrootFileSystem fails with an invalid directory + ASSERT_NOK(FileSystem::CreateFromString( + config_options_, + std::string("chroot_dir=/No/Such/Directory; id=") + + ChrootFileSystem::kClassName(), + &fs)); + std::string chroot_opts = std::string("chroot_dir=") + tmp_dir + + std::string("; id=") + + ChrootFileSystem::kClassName(); + + // Create a valid ChrootFileSystem with an inner Default + ASSERT_OK(FileSystem::CreateFromString(config_options_, chroot_opts, &fs)); + ASSERT_NE(fs, nullptr); + ASSERT_STREQ(fs->Name(), ChrootFileSystem::kClassName()); + ASSERT_EQ(fs->Inner(), FileSystem::Default().get()); + std::string opts_str = fs->ToString(config_options_); + std::string mismatch; + ASSERT_OK(FileSystem::CreateFromString(config_options_, opts_str, ©)); + ASSERT_TRUE(fs->AreEquivalent(config_options_, copy.get(), &mismatch)); + + // Create a valid ChrootFileSystem with an inner TimedFileSystem + ASSERT_OK(FileSystem::CreateFromString( + config_options_, + chroot_opts + "; target=" + TimedFileSystem::kClassName(), &fs)); + ASSERT_NE(fs, nullptr); + ASSERT_STREQ(fs->Name(), ChrootFileSystem::kClassName()); + ASSERT_NE(fs->Inner(), nullptr); + ASSERT_STREQ(fs->Inner()->Name(), TimedFileSystem::kClassName()); + ASSERT_EQ(fs->Inner()->Inner(), FileSystem::Default().get()); + opts_str = fs->ToString(config_options_); + ASSERT_OK(FileSystem::CreateFromString(config_options_, opts_str, ©)); + ASSERT_TRUE(fs->AreEquivalent(config_options_, copy.get(), &mismatch)); + + // Create a TimedFileSystem with an inner ChrootFileSystem + ASSERT_OK(FileSystem::CreateFromString( + config_options_, + "target={" + chroot_opts + "}; id=" + TimedFileSystem::kClassName(), + &fs)); + ASSERT_NE(fs, nullptr); + ASSERT_STREQ(fs->Name(), TimedFileSystem::kClassName()); + ASSERT_NE(fs->Inner(), nullptr); + ASSERT_STREQ(fs->Inner()->Name(), ChrootFileSystem::kClassName()); + 
ASSERT_EQ(fs->Inner()->Inner(), FileSystem::Default().get()); + opts_str = fs->ToString(config_options_); + ASSERT_OK(FileSystem::CreateFromString(config_options_, opts_str, ©)); + ASSERT_TRUE(fs->AreEquivalent(config_options_, copy.get(), &mismatch)); +} +#endif // OS_WIN + +TEST_F(CreateEnvTest, CreateEncryptedFileSystem) { + std::shared_ptr<FileSystem> fs, copy; + + std::string base_opts = + std::string("provider=1://test; id=") + EncryptedFileSystem::kClassName(); + // The EncryptedFileSystem requires a "provider" option. + ASSERT_NOK(FileSystem::CreateFromString( + config_options_, EncryptedFileSystem::kClassName(), &fs)); + + ASSERT_OK(FileSystem::CreateFromString(config_options_, base_opts, &fs)); + + ASSERT_NE(fs, nullptr); + ASSERT_STREQ(fs->Name(), EncryptedFileSystem::kClassName()); + ASSERT_EQ(fs->Inner(), FileSystem::Default().get()); + std::string opts_str = fs->ToString(config_options_); + std::string mismatch; + ASSERT_OK(FileSystem::CreateFromString(config_options_, opts_str, ©)); + ASSERT_TRUE(fs->AreEquivalent(config_options_, copy.get(), &mismatch)); + ASSERT_OK(FileSystem::CreateFromString( + config_options_, base_opts + "; target=" + TimedFileSystem::kClassName(), + &fs)); + ASSERT_NE(fs, nullptr); + ASSERT_STREQ(fs->Name(), EncryptedFileSystem::kClassName()); + ASSERT_NE(fs->Inner(), nullptr); + ASSERT_STREQ(fs->Inner()->Name(), TimedFileSystem::kClassName()); + ASSERT_EQ(fs->Inner()->Inner(), FileSystem::Default().get()); + opts_str = fs->ToString(config_options_); + ASSERT_OK(FileSystem::CreateFromString(config_options_, opts_str, ©)); + ASSERT_TRUE(fs->AreEquivalent(config_options_, copy.get(), &mismatch)); +} + +#endif // ROCKSDB_LITE + +namespace { + +constexpr size_t kThreads = 8; +constexpr size_t kIdsPerThread = 1000; + +// This is a mini-stress test to check for duplicates in functions like +// GenerateUniqueId() +template <typename IdType, class Hash = std::hash<IdType>> +struct NoDuplicateMiniStressTest { + 
std::unordered_set<IdType, Hash> ids; + std::mutex mutex; + Env* env; + + NoDuplicateMiniStressTest() { env = Env::Default(); } + + virtual ~NoDuplicateMiniStressTest() {} + + void Run() { + std::array<std::thread, kThreads> threads; + for (size_t i = 0; i < kThreads; ++i) { + threads[i] = std::thread([&]() { ThreadFn(); }); + } + for (auto& thread : threads) { + thread.join(); + } + // All must be unique + ASSERT_EQ(ids.size(), kThreads * kIdsPerThread); + } + + void ThreadFn() { + std::array<IdType, kIdsPerThread> my_ids; + // Generate in parallel threads as fast as possible + for (size_t i = 0; i < kIdsPerThread; ++i) { + my_ids[i] = Generate(); + } + // Now collate + std::lock_guard<std::mutex> lock(mutex); + for (auto& id : my_ids) { + ids.insert(id); + } + } + + virtual IdType Generate() = 0; +}; + +void VerifyRfcUuids(const std::unordered_set<std::string>& uuids) { + if (uuids.empty()) { + return; + } +} + +using uint64_pair_t = std::pair<uint64_t, uint64_t>; +struct HashUint64Pair { + std::size_t operator()( + std::pair<uint64_t, uint64_t> const& u) const noexcept { + // Assume suitable distribution already + return static_cast<size_t>(u.first ^ u.second); + } +}; + +} // namespace + +TEST_F(EnvTest, GenerateUniqueId) { + struct MyStressTest : public NoDuplicateMiniStressTest<std::string> { + std::string Generate() override { return env->GenerateUniqueId(); } + }; + + MyStressTest t; + t.Run(); + + // Basically verify RFC-4122 format + for (auto& uuid : t.ids) { + ASSERT_EQ(36U, uuid.size()); + ASSERT_EQ('-', uuid[8]); + ASSERT_EQ('-', uuid[13]); + ASSERT_EQ('-', uuid[18]); + ASSERT_EQ('-', uuid[23]); + } +} + +TEST_F(EnvTest, GenerateDbSessionId) { + struct MyStressTest : public NoDuplicateMiniStressTest<std::string> { + std::string Generate() override { return DBImpl::GenerateDbSessionId(env); } + }; + + MyStressTest t; + t.Run(); + + // Basically verify session ID + for (auto& id : t.ids) { + ASSERT_EQ(20U, id.size()); + } +} + +constexpr bool 
kRequirePortGenerateRfcUuid = +#if defined(OS_LINUX) || defined(OS_ANDROID) || defined(OS_WIN) + true; +#else + false; +#endif + +TEST_F(EnvTest, PortGenerateRfcUuid) { + if (!kRequirePortGenerateRfcUuid) { + ROCKSDB_GTEST_SKIP("Not supported/expected on this platform"); + return; + } + struct MyStressTest : public NoDuplicateMiniStressTest<std::string> { + std::string Generate() override { + std::string u; + assert(port::GenerateRfcUuid(&u)); + return u; + } + }; + + MyStressTest t; + t.Run(); + + // Extra verification on versions and variants + VerifyRfcUuids(t.ids); +} + +// Test the atomic, linear generation of GenerateRawUuid +TEST_F(EnvTest, GenerateRawUniqueId) { + struct MyStressTest + : public NoDuplicateMiniStressTest<uint64_pair_t, HashUint64Pair> { + uint64_pair_t Generate() override { + uint64_pair_t p; + GenerateRawUniqueId(&p.first, &p.second); + return p; + } + }; + + MyStressTest t; + t.Run(); +} + +// Test that each entropy source ("track") is at least adequate +TEST_F(EnvTest, GenerateRawUniqueIdTrackPortUuidOnly) { + if (!kRequirePortGenerateRfcUuid) { + ROCKSDB_GTEST_SKIP("Not supported/expected on this platform"); + return; + } + + struct MyStressTest + : public NoDuplicateMiniStressTest<uint64_pair_t, HashUint64Pair> { + uint64_pair_t Generate() override { + uint64_pair_t p; + TEST_GenerateRawUniqueId(&p.first, &p.second, false, true, true); + return p; + } + }; + + MyStressTest t; + t.Run(); +} + +TEST_F(EnvTest, GenerateRawUniqueIdTrackEnvDetailsOnly) { + struct MyStressTest + : public NoDuplicateMiniStressTest<uint64_pair_t, HashUint64Pair> { + uint64_pair_t Generate() override { + uint64_pair_t p; + TEST_GenerateRawUniqueId(&p.first, &p.second, true, false, true); + return p; + } + }; + + MyStressTest t; + t.Run(); +} + +TEST_F(EnvTest, GenerateRawUniqueIdTrackRandomDeviceOnly) { + struct MyStressTest + : public NoDuplicateMiniStressTest<uint64_pair_t, HashUint64Pair> { + uint64_pair_t Generate() override { + uint64_pair_t p; + 
TEST_GenerateRawUniqueId(&p.first, &p.second, true, true, false); + return p; + } + }; + + MyStressTest t; + t.Run(); +} + +TEST_F(EnvTest, SemiStructuredUniqueIdGenTest) { + // Must be thread safe and usable as a static + static SemiStructuredUniqueIdGen gen; + + struct MyStressTest + : public NoDuplicateMiniStressTest<uint64_pair_t, HashUint64Pair> { + uint64_pair_t Generate() override { + uint64_pair_t p; + gen.GenerateNext(&p.first, &p.second); + return p; + } + }; + + MyStressTest t; + t.Run(); +} + +TEST_F(EnvTest, FailureToCreateLockFile) { + auto env = Env::Default(); + auto fs = env->GetFileSystem(); + std::string dir = test::PerThreadDBPath(env, "lockdir"); + std::string file = dir + "/lockfile"; + + // Ensure directory doesn't exist + ASSERT_OK(DestroyDir(env, dir)); + + // Make sure that we can acquire a file lock after the first attempt fails + FileLock* lock = nullptr; + ASSERT_NOK(fs->LockFile(file, IOOptions(), &lock, /*dbg*/ nullptr)); + ASSERT_FALSE(lock); + + ASSERT_OK(fs->CreateDir(dir, IOOptions(), /*dbg*/ nullptr)); + ASSERT_OK(fs->LockFile(file, IOOptions(), &lock, /*dbg*/ nullptr)); + ASSERT_OK(fs->UnlockFile(lock, IOOptions(), /*dbg*/ nullptr)); + + // Clean up + ASSERT_OK(DestroyDir(env, dir)); +} + +TEST_F(CreateEnvTest, CreateDefaultEnv) { + ConfigOptions options; + options.ignore_unsupported_options = false; + + std::shared_ptr<Env> guard; + Env* env = nullptr; + ASSERT_OK(Env::CreateFromString(options, "", &env)); + ASSERT_EQ(env, Env::Default()); + + env = nullptr; + ASSERT_OK(Env::CreateFromString(options, Env::kDefaultName(), &env)); + ASSERT_EQ(env, Env::Default()); + + env = nullptr; + ASSERT_OK(Env::CreateFromString(options, "", &env, &guard)); + ASSERT_EQ(env, Env::Default()); + ASSERT_EQ(guard, nullptr); + + env = nullptr; + ASSERT_OK(Env::CreateFromString(options, Env::kDefaultName(), &env, &guard)); + ASSERT_EQ(env, Env::Default()); + ASSERT_EQ(guard, nullptr); + +#ifndef ROCKSDB_LITE + std::string opt_str = 
env->ToString(options); + ASSERT_OK(Env::CreateFromString(options, opt_str, &env)); + ASSERT_EQ(env, Env::Default()); + ASSERT_OK(Env::CreateFromString(options, opt_str, &env, &guard)); + ASSERT_EQ(env, Env::Default()); + ASSERT_EQ(guard, nullptr); +#endif // ROCKSDB_LITE +} + +#ifndef ROCKSDB_LITE +namespace { +class WrappedEnv : public EnvWrapper { + public: + explicit WrappedEnv(Env* t) : EnvWrapper(t) {} + explicit WrappedEnv(const std::shared_ptr<Env>& t) : EnvWrapper(t) {} + static const char* kClassName() { return "WrappedEnv"; } + const char* Name() const override { return kClassName(); } + static void Register(ObjectLibrary& lib, const std::string& /*arg*/) { + lib.AddFactory<Env>( + WrappedEnv::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr<Env>* guard, + std::string* /* errmsg */) { + guard->reset(new WrappedEnv(nullptr)); + return guard->get(); + }); + } +}; +} // namespace +TEST_F(CreateEnvTest, CreateMockEnv) { + ConfigOptions options; + options.ignore_unsupported_options = false; + WrappedEnv::Register(*(options.registry->AddLibrary("test")), ""); + std::shared_ptr<Env> guard, copy; + std::string opt_str; + + Env* env = nullptr; + ASSERT_NOK(Env::CreateFromString(options, MockEnv::kClassName(), &env)); + ASSERT_OK( + Env::CreateFromString(options, MockEnv::kClassName(), &env, &guard)); + ASSERT_NE(env, nullptr); + ASSERT_NE(env, Env::Default()); + opt_str = env->ToString(options); + ASSERT_OK(Env::CreateFromString(options, opt_str, &env, ©)); + ASSERT_NE(copy, guard); + std::string mismatch; + ASSERT_TRUE(guard->AreEquivalent(options, copy.get(), &mismatch)); + guard.reset(MockEnv::Create(Env::Default(), SystemClock::Default())); + opt_str = guard->ToString(options); + ASSERT_OK(Env::CreateFromString(options, opt_str, &env, ©)); + std::unique_ptr<Env> wrapped_env(new WrappedEnv(Env::Default())); + guard.reset(MockEnv::Create(wrapped_env.get(), SystemClock::Default())); + opt_str = guard->ToString(options); + 
ASSERT_OK(Env::CreateFromString(options, opt_str, &env, ©)); + opt_str = copy->ToString(options); +} + +TEST_F(CreateEnvTest, CreateWrappedEnv) { + ConfigOptions options; + options.ignore_unsupported_options = false; + WrappedEnv::Register(*(options.registry->AddLibrary("test")), ""); + Env* env = nullptr; + std::shared_ptr<Env> guard, copy; + std::string opt_str; + std::string mismatch; + + ASSERT_NOK(Env::CreateFromString(options, WrappedEnv::kClassName(), &env)); + ASSERT_OK( + Env::CreateFromString(options, WrappedEnv::kClassName(), &env, &guard)); + ASSERT_NE(env, nullptr); + ASSERT_NE(env, Env::Default()); + ASSERT_FALSE(guard->AreEquivalent(options, Env::Default(), &mismatch)); + + opt_str = env->ToString(options); + ASSERT_OK(Env::CreateFromString(options, opt_str, &env, ©)); + ASSERT_NE(copy, guard); + ASSERT_TRUE(guard->AreEquivalent(options, copy.get(), &mismatch)); + + guard.reset(new WrappedEnv(std::make_shared<WrappedEnv>(Env::Default()))); + ASSERT_NE(guard.get(), env); + opt_str = guard->ToString(options); + ASSERT_OK(Env::CreateFromString(options, opt_str, &env, ©)); + ASSERT_NE(copy, guard); + ASSERT_TRUE(guard->AreEquivalent(options, copy.get(), &mismatch)); + + guard.reset(new WrappedEnv(std::make_shared<WrappedEnv>( + std::make_shared<WrappedEnv>(Env::Default())))); + ASSERT_NE(guard.get(), env); + opt_str = guard->ToString(options); + ASSERT_OK(Env::CreateFromString(options, opt_str, &env, ©)); + ASSERT_NE(copy, guard); + ASSERT_TRUE(guard->AreEquivalent(options, copy.get(), &mismatch)); +} + +TEST_F(CreateEnvTest, CreateCompositeEnv) { + ConfigOptions options; + options.ignore_unsupported_options = false; + std::shared_ptr<Env> guard, copy; + Env* env = nullptr; + std::string mismatch, opt_str; + + WrappedEnv::Register(*(options.registry->AddLibrary("test")), ""); + std::unique_ptr<Env> base(NewCompositeEnv(FileSystem::Default())); + std::unique_ptr<Env> wrapped(new WrappedEnv(Env::Default())); + std::shared_ptr<FileSystem> timed_fs = + 
std::make_shared<TimedFileSystem>(FileSystem::Default()); + std::shared_ptr<SystemClock> clock = + std::make_shared<EmulatedSystemClock>(SystemClock::Default()); + + opt_str = base->ToString(options); + ASSERT_NOK(Env::CreateFromString(options, opt_str, &env)); + ASSERT_OK(Env::CreateFromString(options, opt_str, &env, &guard)); + ASSERT_NE(env, nullptr); + ASSERT_NE(env, Env::Default()); + ASSERT_EQ(env->GetFileSystem(), FileSystem::Default()); + ASSERT_EQ(env->GetSystemClock(), SystemClock::Default()); + + base = NewCompositeEnv(timed_fs); + opt_str = base->ToString(options); + ASSERT_NOK(Env::CreateFromString(options, opt_str, &env)); + ASSERT_OK(Env::CreateFromString(options, opt_str, &env, &guard)); + ASSERT_NE(env, nullptr); + ASSERT_NE(env, Env::Default()); + ASSERT_NE(env->GetFileSystem(), FileSystem::Default()); + ASSERT_EQ(env->GetSystemClock(), SystemClock::Default()); + + env = nullptr; + guard.reset(new CompositeEnvWrapper(wrapped.get(), timed_fs)); + opt_str = guard->ToString(options); + ASSERT_OK(Env::CreateFromString(options, opt_str, &env, ©)); + ASSERT_NE(env, nullptr); + ASSERT_NE(env, Env::Default()); + ASSERT_TRUE(guard->AreEquivalent(options, copy.get(), &mismatch)); + + env = nullptr; + guard.reset(new CompositeEnvWrapper(wrapped.get(), clock)); + opt_str = guard->ToString(options); + ASSERT_OK(Env::CreateFromString(options, opt_str, &env, ©)); + ASSERT_NE(env, nullptr); + ASSERT_NE(env, Env::Default()); + ASSERT_TRUE(guard->AreEquivalent(options, copy.get(), &mismatch)); + + env = nullptr; + guard.reset(new CompositeEnvWrapper(wrapped.get(), timed_fs, clock)); + opt_str = guard->ToString(options); + ASSERT_OK(Env::CreateFromString(options, opt_str, &env, ©)); + ASSERT_NE(env, nullptr); + ASSERT_NE(env, Env::Default()); + ASSERT_TRUE(guard->AreEquivalent(options, copy.get(), &mismatch)); + + guard.reset(new CompositeEnvWrapper(nullptr, timed_fs, clock)); + ColumnFamilyOptions cf_opts; + DBOptions db_opts; + db_opts.env = guard.get(); + auto 
comp = db_opts.env->CheckedCast<CompositeEnvWrapper>(); + ASSERT_NE(comp, nullptr); + ASSERT_EQ(comp->Inner(), nullptr); + ASSERT_NOK(ValidateOptions(db_opts, cf_opts)); + ASSERT_OK(db_opts.env->PrepareOptions(options)); + ASSERT_NE(comp->Inner(), nullptr); + ASSERT_OK(ValidateOptions(db_opts, cf_opts)); +} +#endif // ROCKSDB_LITE + +// Forward declaration +class ReadAsyncFS; + +struct MockIOHandle { + std::function<void(const FSReadRequest&, void*)> cb; + void* cb_arg; + bool create_io_error; +}; + +// ReadAsyncFS and ReadAsyncRandomAccessFile mocks the FS doing asynchronous +// reads by creating threads that submit read requests and then calling Poll API +// to obtain those results. +class ReadAsyncRandomAccessFile : public FSRandomAccessFileOwnerWrapper { + public: + ReadAsyncRandomAccessFile(ReadAsyncFS& fs, + std::unique_ptr<FSRandomAccessFile>& file) + : FSRandomAccessFileOwnerWrapper(std::move(file)), fs_(fs) {} + + IOStatus ReadAsync(FSReadRequest& req, const IOOptions& opts, + std::function<void(const FSReadRequest&, void*)> cb, + void* cb_arg, void** io_handle, IOHandleDeleter* del_fn, + IODebugContext* dbg) override; + + private: + ReadAsyncFS& fs_; + std::unique_ptr<FSRandomAccessFile> file_; + int counter = 0; +}; + +class ReadAsyncFS : public FileSystemWrapper { + public: + explicit ReadAsyncFS(const std::shared_ptr<FileSystem>& wrapped) + : FileSystemWrapper(wrapped) {} + + static const char* kClassName() { return "ReadAsyncFS"; } + const char* Name() const override { return kClassName(); } + + IOStatus NewRandomAccessFile(const std::string& fname, + const FileOptions& opts, + std::unique_ptr<FSRandomAccessFile>* result, + IODebugContext* dbg) override { + std::unique_ptr<FSRandomAccessFile> file; + IOStatus s = target()->NewRandomAccessFile(fname, opts, &file, dbg); + EXPECT_OK(s); + result->reset(new ReadAsyncRandomAccessFile(*this, file)); + return s; + } + + IOStatus Poll(std::vector<void*>& io_handles, + size_t /*min_completions*/) override { + 
// Wait for the threads completion. + for (auto& t : workers) { + t.join(); + } + + for (size_t i = 0; i < io_handles.size(); i++) { + MockIOHandle* handle = static_cast<MockIOHandle*>(io_handles[i]); + if (handle->create_io_error) { + FSReadRequest req; + req.status = IOStatus::IOError(); + handle->cb(req, handle->cb_arg); + } + } + return IOStatus::OK(); + } + + std::vector<std::thread> workers; +}; + +IOStatus ReadAsyncRandomAccessFile::ReadAsync( + FSReadRequest& req, const IOOptions& opts, + std::function<void(const FSReadRequest&, void*)> cb, void* cb_arg, + void** io_handle, IOHandleDeleter* del_fn, IODebugContext* dbg) { + IOHandleDeleter deletefn = [](void* args) -> void { + delete (static_cast<MockIOHandle*>(args)); + args = nullptr; + }; + *del_fn = deletefn; + + // Allocate and populate io_handle. + MockIOHandle* mock_handle = new MockIOHandle(); + bool create_io_error = false; + if (counter % 2) { + create_io_error = true; + } + mock_handle->create_io_error = create_io_error; + mock_handle->cb = cb; + mock_handle->cb_arg = cb_arg; + *io_handle = static_cast<void*>(mock_handle); + counter++; + + // Submit read request asynchronously. + std::function<void(FSReadRequest)> submit_request = + [&opts, cb, cb_arg, dbg, create_io_error, this](FSReadRequest _req) { + if (!create_io_error) { + _req.status = target()->Read(_req.offset, _req.len, opts, + &(_req.result), _req.scratch, dbg); + cb(_req, cb_arg); + } + }; + + fs_.workers.emplace_back(submit_request, req); + return IOStatus::OK(); +} + +class TestAsyncRead : public testing::Test { + public: + TestAsyncRead() { env_ = Env::Default(); } + Env* env_; +}; + +// Tests the default implementation of ReadAsync API. +TEST_F(TestAsyncRead, ReadAsync) { + EnvOptions soptions; + std::shared_ptr<ReadAsyncFS> fs = + std::make_shared<ReadAsyncFS>(env_->GetFileSystem()); + + std::string fname = test::PerThreadDBPath(env_, "testfile"); + + const size_t kSectorSize = 4096; + const size_t kNumSectors = 8; + + // 1. 
create & write to a file. + { + std::unique_ptr<FSWritableFile> wfile; + ASSERT_OK( + fs->NewWritableFile(fname, FileOptions(), &wfile, nullptr /*dbg*/)); + + for (size_t i = 0; i < kNumSectors; ++i) { + auto data = NewAligned(kSectorSize * 8, static_cast<char>(i + 1)); + Slice slice(data.get(), kSectorSize); + ASSERT_OK(wfile->Append(slice, IOOptions(), nullptr)); + } + ASSERT_OK(wfile->Close(IOOptions(), nullptr)); + } + // 2. Read file + { + std::unique_ptr<FSRandomAccessFile> file; + ASSERT_OK(fs->NewRandomAccessFile(fname, FileOptions(), &file, nullptr)); + + IOOptions opts; + std::vector<void*> io_handles(kNumSectors); + std::vector<FSReadRequest> reqs(kNumSectors); + std::vector<std::unique_ptr<char, Deleter>> data; + std::vector<size_t> vals; + IOHandleDeleter del_fn; + uint64_t offset = 0; + + // Initialize read requests + for (size_t i = 0; i < kNumSectors; i++) { + reqs[i].offset = offset; + reqs[i].len = kSectorSize; + data.emplace_back(NewAligned(kSectorSize, 0)); + reqs[i].scratch = data.back().get(); + vals.push_back(i); + offset += kSectorSize; + } + + // callback function passed to async read. + std::function<void(const FSReadRequest&, void*)> callback = + [&](const FSReadRequest& req, void* cb_arg) { + assert(cb_arg != nullptr); + size_t i = *(reinterpret_cast<size_t*>(cb_arg)); + reqs[i].offset = req.offset; + reqs[i].result = req.result; + reqs[i].status = req.status; + }; + + // Submit asynchronous read requests. + for (size_t i = 0; i < kNumSectors; i++) { + void* cb_arg = static_cast<void*>(&(vals[i])); + ASSERT_OK(file->ReadAsync(reqs[i], opts, callback, cb_arg, + &(io_handles[i]), &del_fn, nullptr)); + } + + // Poll for the submitted requests. + fs->Poll(io_handles, kNumSectors); + + // Check the status of read requests. 
+ for (size_t i = 0; i < kNumSectors; i++) { + if (i % 2) { + ASSERT_EQ(reqs[i].status, IOStatus::IOError()); + } else { + auto buf = NewAligned(kSectorSize * 8, static_cast<char>(i + 1)); + Slice expected_data(buf.get(), kSectorSize); + + ASSERT_EQ(reqs[i].offset, i * kSectorSize); + ASSERT_OK(reqs[i].status); + ASSERT_EQ(expected_data.ToString(), reqs[i].result.ToString()); + } + } + + // Delete io_handles. + for (size_t i = 0; i < io_handles.size(); i++) { + del_fn(io_handles[i]); + } + } +} +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/env/file_system.cc b/src/rocksdb/env/file_system.cc new file mode 100644 index 000000000..f9dda429a --- /dev/null +++ b/src/rocksdb/env/file_system.cc @@ -0,0 +1,290 @@ +// Copyright (c) 2019-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+// +#include "rocksdb/file_system.h" + +#include "env/composite_env_wrapper.h" +#include "env/env_chroot.h" +#include "env/env_encryption_ctr.h" +#include "env/fs_readonly.h" +#include "env/mock_env.h" +#include "logging/env_logger.h" +#include "options/db_options.h" +#include "rocksdb/convenience.h" +#include "rocksdb/utilities/customizable_util.h" +#include "rocksdb/utilities/object_registry.h" +#include "rocksdb/utilities/options_type.h" +#include "util/string_util.h" +#include "utilities/counted_fs.h" +#include "utilities/env_timed.h" + +namespace ROCKSDB_NAMESPACE { + +FileSystem::FileSystem() {} + +FileSystem::~FileSystem() {} + +Status FileSystem::Load(const std::string& value, + std::shared_ptr<FileSystem>* result) { + return CreateFromString(ConfigOptions(), value, result); +} + +#ifndef ROCKSDB_LITE +static int RegisterBuiltinFileSystems(ObjectLibrary& library, + const std::string& /*arg*/) { + library.AddFactory<FileSystem>( + TimedFileSystem::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr<FileSystem>* guard, + std::string* /* errmsg */) { + guard->reset(new TimedFileSystem(nullptr)); + return guard->get(); + }); + library.AddFactory<FileSystem>( + ReadOnlyFileSystem::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr<FileSystem>* guard, + std::string* /* errmsg */) { + guard->reset(new ReadOnlyFileSystem(nullptr)); + return guard->get(); + }); + library.AddFactory<FileSystem>( + EncryptedFileSystem::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr<FileSystem>* guard, + std::string* errmsg) { + Status s = NewEncryptedFileSystemImpl(nullptr, nullptr, guard); + if (!s.ok()) { + *errmsg = s.ToString(); + } + return guard->get(); + }); + library.AddFactory<FileSystem>( + CountedFileSystem::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr<FileSystem>* guard, + std::string* /*errmsg*/) { + guard->reset(new CountedFileSystem(FileSystem::Default())); + return guard->get(); + }); + 
library.AddFactory<FileSystem>( + MockFileSystem::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr<FileSystem>* guard, + std::string* /*errmsg*/) { + guard->reset(new MockFileSystem(SystemClock::Default())); + return guard->get(); + }); +#ifndef OS_WIN + library.AddFactory<FileSystem>( + ChrootFileSystem::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr<FileSystem>* guard, + std::string* /* errmsg */) { + guard->reset(new ChrootFileSystem(nullptr, "")); + return guard->get(); + }); +#endif // OS_WIN + size_t num_types; + return static_cast<int>(library.GetFactoryCount(&num_types)); +} +#endif // ROCKSDB_LITE + +Status FileSystem::CreateFromString(const ConfigOptions& config_options, + const std::string& value, + std::shared_ptr<FileSystem>* result) { + auto default_fs = FileSystem::Default(); + if (default_fs->IsInstanceOf(value)) { + *result = default_fs; + return Status::OK(); + } else { +#ifndef ROCKSDB_LITE + static std::once_flag once; + std::call_once(once, [&]() { + RegisterBuiltinFileSystems(*(ObjectLibrary::Default().get()), ""); + }); +#endif // ROCKSDB_LITE + return LoadSharedObject<FileSystem>(config_options, value, nullptr, result); + } +} + +IOStatus FileSystem::ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + const FileOptions& opts, + std::unique_ptr<FSWritableFile>* result, + IODebugContext* dbg) { + IOStatus s = RenameFile(old_fname, fname, opts.io_options, dbg); + if (!s.ok()) { + return s; + } + return NewWritableFile(fname, opts, result, dbg); +} + +IOStatus FileSystem::NewLogger(const std::string& fname, + const IOOptions& io_opts, + std::shared_ptr<Logger>* result, + IODebugContext* dbg) { + FileOptions options; + options.io_options = io_opts; + // TODO: Tune the buffer size. 
+ options.writable_file_max_buffer_size = 1024 * 1024; + std::unique_ptr<FSWritableFile> writable_file; + const IOStatus status = NewWritableFile(fname, options, &writable_file, dbg); + if (!status.ok()) { + return status; + } + + *result = std::make_shared<EnvLogger>(std::move(writable_file), fname, + options, Env::Default()); + return IOStatus::OK(); +} + +FileOptions FileSystem::OptimizeForLogRead( + const FileOptions& file_options) const { + FileOptions optimized_file_options(file_options); + optimized_file_options.use_direct_reads = false; + return optimized_file_options; +} + +FileOptions FileSystem::OptimizeForManifestRead( + const FileOptions& file_options) const { + FileOptions optimized_file_options(file_options); + optimized_file_options.use_direct_reads = false; + return optimized_file_options; +} + +FileOptions FileSystem::OptimizeForLogWrite(const FileOptions& file_options, + const DBOptions& db_options) const { + FileOptions optimized_file_options(file_options); + optimized_file_options.bytes_per_sync = db_options.wal_bytes_per_sync; + optimized_file_options.writable_file_max_buffer_size = + db_options.writable_file_max_buffer_size; + return optimized_file_options; +} + +FileOptions FileSystem::OptimizeForManifestWrite( + const FileOptions& file_options) const { + return file_options; +} + +FileOptions FileSystem::OptimizeForCompactionTableWrite( + const FileOptions& file_options, + const ImmutableDBOptions& db_options) const { + FileOptions optimized_file_options(file_options); + optimized_file_options.use_direct_writes = + db_options.use_direct_io_for_flush_and_compaction; + return optimized_file_options; +} + +FileOptions FileSystem::OptimizeForCompactionTableRead( + const FileOptions& file_options, + const ImmutableDBOptions& db_options) const { + FileOptions optimized_file_options(file_options); + optimized_file_options.use_direct_reads = db_options.use_direct_reads; + return optimized_file_options; +} + +FileOptions 
FileSystem::OptimizeForBlobFileRead( + const FileOptions& file_options, + const ImmutableDBOptions& db_options) const { + FileOptions optimized_file_options(file_options); + optimized_file_options.use_direct_reads = db_options.use_direct_reads; + return optimized_file_options; +} + +IOStatus WriteStringToFile(FileSystem* fs, const Slice& data, + const std::string& fname, bool should_sync) { + std::unique_ptr<FSWritableFile> file; + EnvOptions soptions; + IOStatus s = fs->NewWritableFile(fname, soptions, &file, nullptr); + if (!s.ok()) { + return s; + } + s = file->Append(data, IOOptions(), nullptr); + if (s.ok() && should_sync) { + s = file->Sync(IOOptions(), nullptr); + } + if (!s.ok()) { + fs->DeleteFile(fname, IOOptions(), nullptr); + } + return s; +} + +IOStatus ReadFileToString(FileSystem* fs, const std::string& fname, + std::string* data) { + FileOptions soptions; + data->clear(); + std::unique_ptr<FSSequentialFile> file; + IOStatus s = status_to_io_status( + fs->NewSequentialFile(fname, soptions, &file, nullptr)); + if (!s.ok()) { + return s; + } + static const int kBufferSize = 8192; + char* space = new char[kBufferSize]; + while (true) { + Slice fragment; + s = file->Read(kBufferSize, IOOptions(), &fragment, space, nullptr); + if (!s.ok()) { + break; + } + data->append(fragment.data(), fragment.size()); + if (fragment.empty()) { + break; + } + } + delete[] space; + return s; +} + +namespace { +static std::unordered_map<std::string, OptionTypeInfo> fs_wrapper_type_info = { +#ifndef ROCKSDB_LITE + {"target", + OptionTypeInfo::AsCustomSharedPtr<FileSystem>( + 0, OptionVerificationType::kByName, OptionTypeFlags::kDontSerialize)}, +#endif // ROCKSDB_LITE +}; +} // namespace +FileSystemWrapper::FileSystemWrapper(const std::shared_ptr<FileSystem>& t) + : target_(t) { + RegisterOptions("", &target_, &fs_wrapper_type_info); +} + +Status FileSystemWrapper::PrepareOptions(const ConfigOptions& options) { + if (target_ == nullptr) { + target_ = FileSystem::Default(); + 
} + return FileSystem::PrepareOptions(options); +} + +#ifndef ROCKSDB_LITE +std::string FileSystemWrapper::SerializeOptions( + const ConfigOptions& config_options, const std::string& header) const { + auto parent = FileSystem::SerializeOptions(config_options, ""); + if (config_options.IsShallow() || target_ == nullptr || + target_->IsInstanceOf(FileSystem::kDefaultName())) { + return parent; + } else { + std::string result = header; + if (!StartsWith(parent, OptionTypeInfo::kIdPropName())) { + result.append(OptionTypeInfo::kIdPropName()).append("="); + } + result.append(parent); + if (!EndsWith(result, config_options.delimiter)) { + result.append(config_options.delimiter); + } + result.append("target=").append(target_->ToString(config_options)); + return result; + } +} +#endif // ROCKSDB_LITE + +DirFsyncOptions::DirFsyncOptions() { reason = kDefault; } + +DirFsyncOptions::DirFsyncOptions(std::string file_renamed_new_name) { + reason = kFileRenamed; + renamed_new_name = file_renamed_new_name; +} + +DirFsyncOptions::DirFsyncOptions(FsyncReason fsync_reason) { + assert(fsync_reason != kFileRenamed); + reason = fsync_reason; +} +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/env/file_system_tracer.cc b/src/rocksdb/env/file_system_tracer.cc new file mode 100644 index 000000000..d0c45c57e --- /dev/null +++ b/src/rocksdb/env/file_system_tracer.cc @@ -0,0 +1,564 @@ +// Copyright (c) 2019-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "env/file_system_tracer.h" + +#include "rocksdb/file_system.h" +#include "rocksdb/system_clock.h" +#include "rocksdb/trace_record.h" + +namespace ROCKSDB_NAMESPACE { + +IOStatus FileSystemTracingWrapper::NewSequentialFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr<FSSequentialFile>* result, IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->NewSequentialFile(fname, file_opts, result, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + fname.substr(fname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::NewRandomAccessFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr<FSRandomAccessFile>* result, IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->NewRandomAccessFile(fname, file_opts, result, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + fname.substr(fname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::NewWritableFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr<FSWritableFile>* result, IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->NewWritableFile(fname, file_opts, result, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + fname.substr(fname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::ReopenWritableFile( + const std::string& fname, const 
FileOptions& file_opts, + std::unique_ptr<FSWritableFile>* result, IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->ReopenWritableFile(fname, file_opts, result, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + fname.substr(fname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::ReuseWritableFile( + const std::string& fname, const std::string& old_fname, + const FileOptions& file_opts, std::unique_ptr<FSWritableFile>* result, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = + target()->ReuseWritableFile(fname, old_fname, file_opts, result, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + fname.substr(fname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::NewRandomRWFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr<FSRandomRWFile>* result, IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->NewRandomRWFile(fname, file_opts, result, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + fname.substr(fname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::NewDirectory( + const std::string& name, const IOOptions& io_opts, + std::unique_ptr<FSDirectory>* result, IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->NewDirectory(name, io_opts, result, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + 
IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + name.substr(name.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::GetChildren(const std::string& dir, + const IOOptions& io_opts, + std::vector<std::string>* r, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->GetChildren(dir, io_opts, r, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + dir.substr(dir.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::DeleteFile(const std::string& fname, + const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->DeleteFile(fname, options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + fname.substr(fname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::CreateDir(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->CreateDir(dirname, options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + dirname.substr(dirname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::CreateDirIfMissing( + const std::string& dirname, const IOOptions& options, IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->CreateDirIfMissing(dirname, 
options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + dirname.substr(dirname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::DeleteDir(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->DeleteDir(dirname, options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + dirname.substr(dirname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::GetFileSize(const std::string& fname, + const IOOptions& options, + uint64_t* file_size, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->GetFileSize(fname, options, file_size, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOFileSize); + IOTraceRecord io_record( + clock_->NowNanos(), TraceType::kIOTracer, io_op_data, __func__, elapsed, + s.ToString(), fname.substr(fname.find_last_of("/\\") + 1), *file_size); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::Truncate(const std::string& fname, + size_t size, + const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Truncate(fname, size, options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOFileSize); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), + fname.substr(fname.find_last_of("/\\") + 1), size); + io_tracer_->WriteIOOp(io_record, dbg); + 
return s; +} + +IOStatus FSSequentialFileTracingWrapper::Read(size_t n, + const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Read(n, options, result, scratch, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, + result->size(), 0 /*Offset*/); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FSSequentialFileTracingWrapper::InvalidateCache(size_t offset, + size_t length) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->InvalidateCache(offset, length); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, length, + offset); + io_tracer_->WriteIOOp(io_record, nullptr /*dbg*/); + return s; +} + +IOStatus FSSequentialFileTracingWrapper::PositionedRead( + uint64_t offset, size_t n, const IOOptions& options, Slice* result, + char* scratch, IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = + target()->PositionedRead(offset, n, options, result, scratch, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, + result->size(), offset); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FSRandomAccessFileTracingWrapper::Read(uint64_t offset, size_t n, + const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) 
const { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Read(offset, n, options, result, scratch, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, n, + offset); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FSRandomAccessFileTracingWrapper::MultiRead(FSReadRequest* reqs, + size_t num_reqs, + const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->MultiRead(reqs, num_reqs, options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t latency = elapsed; + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + for (size_t i = 0; i < num_reqs; i++) { + IOTraceRecord io_record( + clock_->NowNanos(), TraceType::kIOTracer, io_op_data, __func__, latency, + reqs[i].status.ToString(), file_name_, reqs[i].len, reqs[i].offset); + io_tracer_->WriteIOOp(io_record, dbg); + } + return s; +} + +IOStatus FSRandomAccessFileTracingWrapper::Prefetch(uint64_t offset, size_t n, + const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Prefetch(offset, n, options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, n, + offset); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FSRandomAccessFileTracingWrapper::InvalidateCache(size_t offset, + size_t length) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->InvalidateCache(offset, 
length); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, length, + static_cast<uint64_t>(offset)); + io_tracer_->WriteIOOp(io_record, nullptr /*dbg*/); + return s; +} + +IOStatus FSRandomAccessFileTracingWrapper::ReadAsync( + FSReadRequest& req, const IOOptions& opts, + std::function<void(const FSReadRequest&, void*)> cb, void* cb_arg, + void** io_handle, IOHandleDeleter* del_fn, IODebugContext* dbg) { + // Create a callback and populate info. + auto read_async_callback = + std::bind(&FSRandomAccessFileTracingWrapper::ReadAsyncCallback, this, + std::placeholders::_1, std::placeholders::_2); + ReadAsyncCallbackInfo* read_async_cb_info = new ReadAsyncCallbackInfo; + read_async_cb_info->cb_ = cb; + read_async_cb_info->cb_arg_ = cb_arg; + read_async_cb_info->start_time_ = clock_->NowNanos(); + read_async_cb_info->file_op_ = __func__; + + IOStatus s = target()->ReadAsync(req, opts, read_async_callback, + read_async_cb_info, io_handle, del_fn, dbg); + + if (!s.ok()) { + delete read_async_cb_info; + } + return s; +} + +void FSRandomAccessFileTracingWrapper::ReadAsyncCallback( + const FSReadRequest& req, void* cb_arg) { + ReadAsyncCallbackInfo* read_async_cb_info = + static_cast<ReadAsyncCallbackInfo*>(cb_arg); + assert(read_async_cb_info); + assert(read_async_cb_info->cb_); + + uint64_t elapsed = clock_->NowNanos() - read_async_cb_info->start_time_; + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + read_async_cb_info->file_op_, elapsed, + req.status.ToString(), file_name_, req.result.size(), + req.offset); + io_tracer_->WriteIOOp(io_record, nullptr /*dbg*/); + + // call the underlying callback. 
+ read_async_cb_info->cb_(req, read_async_cb_info->cb_arg_); + delete read_async_cb_info; +} + +IOStatus FSWritableFileTracingWrapper::Append(const Slice& data, + const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Append(data, options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, + data.size(), 0 /*Offset*/); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FSWritableFileTracingWrapper::PositionedAppend( + const Slice& data, uint64_t offset, const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->PositionedAppend(data, offset, options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, + data.size(), offset); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FSWritableFileTracingWrapper::Truncate(uint64_t size, + const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Truncate(size, options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, size, + 0 /*Offset*/); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FSWritableFileTracingWrapper::Close(const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Close(options, 
dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + file_name_); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +uint64_t FSWritableFileTracingWrapper::GetFileSize(const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + uint64_t file_size = target()->GetFileSize(options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOFileSize); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, "OK", file_name_, file_size); + io_tracer_->WriteIOOp(io_record, dbg); + return file_size; +} + +IOStatus FSWritableFileTracingWrapper::InvalidateCache(size_t offset, + size_t length) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->InvalidateCache(offset, length); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, length, + static_cast<uint64_t>(offset)); + io_tracer_->WriteIOOp(io_record, nullptr /*dbg*/); + return s; +} + +IOStatus FSRandomRWFileTracingWrapper::Write(uint64_t offset, const Slice& data, + const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Write(offset, data, options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, + data.size(), offset); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus 
FSRandomRWFileTracingWrapper::Read(uint64_t offset, size_t n, + const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Read(offset, n, options, result, scratch, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, n, + offset); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FSRandomRWFileTracingWrapper::Flush(const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Flush(options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + file_name_); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FSRandomRWFileTracingWrapper::Close(const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Close(options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + file_name_); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FSRandomRWFileTracingWrapper::Sync(const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Sync(options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + file_name_); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FSRandomRWFileTracingWrapper::Fsync(const IOOptions& options, + 
IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Fsync(options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + file_name_); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/env/file_system_tracer.h b/src/rocksdb/env/file_system_tracer.h new file mode 100644 index 000000000..979a0bf12 --- /dev/null +++ b/src/rocksdb/env/file_system_tracer.h @@ -0,0 +1,461 @@ +// Copyright (c) 2019-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/file_system.h" +#include "rocksdb/system_clock.h" +#include "trace_replay/io_tracer.h" + +namespace ROCKSDB_NAMESPACE { + +// FileSystemTracingWrapper is a wrapper class above FileSystem that forwards +// the call to the underlying storage system. It then invokes IOTracer to record +// file operations and other contextual information in a binary format for +// tracing. It overrides methods we are interested in tracing and extends +// FileSystemWrapper, which forwards all methods that are not explicitly +// overridden. 
+class FileSystemTracingWrapper : public FileSystemWrapper { + public: + FileSystemTracingWrapper(const std::shared_ptr<FileSystem>& t, + const std::shared_ptr<IOTracer>& io_tracer) + : FileSystemWrapper(t), + io_tracer_(io_tracer), + clock_(SystemClock::Default().get()) {} + + ~FileSystemTracingWrapper() override {} + + static const char* kClassName() { return "FileSystemTracing"; } + const char* Name() const override { return kClassName(); } + + IOStatus NewSequentialFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr<FSSequentialFile>* result, + IODebugContext* dbg) override; + + IOStatus NewRandomAccessFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr<FSRandomAccessFile>* result, + IODebugContext* dbg) override; + + IOStatus NewWritableFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr<FSWritableFile>* result, + IODebugContext* dbg) override; + + IOStatus ReopenWritableFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr<FSWritableFile>* result, + IODebugContext* dbg) override; + + IOStatus ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + const FileOptions& file_opts, + std::unique_ptr<FSWritableFile>* result, + IODebugContext* dbg) override; + + IOStatus NewRandomRWFile(const std::string& fname, const FileOptions& options, + std::unique_ptr<FSRandomRWFile>* result, + IODebugContext* dbg) override; + + IOStatus NewDirectory(const std::string& name, const IOOptions& io_opts, + std::unique_ptr<FSDirectory>* result, + IODebugContext* dbg) override; + + IOStatus GetChildren(const std::string& dir, const IOOptions& io_opts, + std::vector<std::string>* r, + IODebugContext* dbg) override; + + IOStatus DeleteFile(const std::string& fname, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus CreateDir(const std::string& dirname, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus 
CreateDirIfMissing(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus DeleteDir(const std::string& dirname, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus GetFileSize(const std::string& fname, const IOOptions& options, + uint64_t* file_size, IODebugContext* dbg) override; + + IOStatus Truncate(const std::string& fname, size_t size, + const IOOptions& options, IODebugContext* dbg) override; + + private: + std::shared_ptr<IOTracer> io_tracer_; + SystemClock* clock_; +}; + +// The FileSystemPtr is a wrapper class that takes pointer to storage systems +// (such as posix filesystems). It overloads operator -> and returns a pointer +// of either FileSystem or FileSystemTracingWrapper based on whether tracing is +// enabled or not. It is added to bypass FileSystemTracingWrapper when tracing +// is disabled. +class FileSystemPtr { + public: + FileSystemPtr(std::shared_ptr<FileSystem> fs, + const std::shared_ptr<IOTracer>& io_tracer) + : fs_(fs), io_tracer_(io_tracer) { + fs_tracer_ = std::make_shared<FileSystemTracingWrapper>(fs_, io_tracer_); + } + + std::shared_ptr<FileSystem> operator->() const { + if (io_tracer_ && io_tracer_->is_tracing_enabled()) { + return fs_tracer_; + } else { + return fs_; + } + } + + /* Returns the underlying File System pointer */ + FileSystem* get() const { + if (io_tracer_ && io_tracer_->is_tracing_enabled()) { + return fs_tracer_.get(); + } else { + return fs_.get(); + } + } + + private: + std::shared_ptr<FileSystem> fs_; + std::shared_ptr<IOTracer> io_tracer_; + std::shared_ptr<FileSystemTracingWrapper> fs_tracer_; +}; + +// FSSequentialFileTracingWrapper is a wrapper class above FSSequentialFile that +// forwards the call to the underlying storage system. It then invokes IOTracer +// to record file operations and other contextual information in a binary format +// for tracing. 
It overrides methods we are interested in tracing and extends +// FSSequentialFileWrapper, which forwards all methods that are not explicitly +// overridden. +class FSSequentialFileTracingWrapper : public FSSequentialFileOwnerWrapper { + public: + FSSequentialFileTracingWrapper(std::unique_ptr<FSSequentialFile>&& t, + std::shared_ptr<IOTracer> io_tracer, + const std::string& file_name) + : FSSequentialFileOwnerWrapper(std::move(t)), + io_tracer_(io_tracer), + clock_(SystemClock::Default().get()), + file_name_(file_name) {} + + ~FSSequentialFileTracingWrapper() override {} + + IOStatus Read(size_t n, const IOOptions& options, Slice* result, + char* scratch, IODebugContext* dbg) override; + + IOStatus InvalidateCache(size_t offset, size_t length) override; + + IOStatus PositionedRead(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) override; + + private: + std::shared_ptr<IOTracer> io_tracer_; + SystemClock* clock_; + std::string file_name_; +}; + +// The FSSequentialFilePtr is a wrapper class that takes pointer to storage +// systems (such as posix filesystems). It overloads operator -> and returns a +// pointer of either FSSequentialFile or FSSequentialFileTracingWrapper based on +// whether tracing is enabled or not. It is added to bypass +// FSSequentialFileTracingWrapper when tracing is disabled. 
+class FSSequentialFilePtr { + public: + FSSequentialFilePtr() = delete; + FSSequentialFilePtr(std::unique_ptr<FSSequentialFile>&& fs, + const std::shared_ptr<IOTracer>& io_tracer, + const std::string& file_name) + : io_tracer_(io_tracer), + fs_tracer_(std::move(fs), io_tracer_, + file_name.substr(file_name.find_last_of("/\\") + + 1) /* pass file name */) {} + + FSSequentialFile* operator->() const { + if (io_tracer_ && io_tracer_->is_tracing_enabled()) { + return const_cast<FSSequentialFileTracingWrapper*>(&fs_tracer_); + } else { + return fs_tracer_.target(); + } + } + + FSSequentialFile* get() const { + if (io_tracer_ && io_tracer_->is_tracing_enabled()) { + return const_cast<FSSequentialFileTracingWrapper*>(&fs_tracer_); + } else { + return fs_tracer_.target(); + } + } + + private: + std::shared_ptr<IOTracer> io_tracer_; + FSSequentialFileTracingWrapper fs_tracer_; +}; + +// FSRandomAccessFileTracingWrapper is a wrapper class above FSRandomAccessFile +// that forwards the call to the underlying storage system. It then invokes +// IOTracer to record file operations and other contextual information in a +// binary format for tracing. It overrides methods we are interested in tracing +// and extends FSRandomAccessFileWrapper, which forwards all methods that are +// not explicitly overridden. 
+class FSRandomAccessFileTracingWrapper : public FSRandomAccessFileOwnerWrapper { + public: + FSRandomAccessFileTracingWrapper(std::unique_ptr<FSRandomAccessFile>&& t, + std::shared_ptr<IOTracer> io_tracer, + const std::string& file_name) + : FSRandomAccessFileOwnerWrapper(std::move(t)), + io_tracer_(io_tracer), + clock_(SystemClock::Default().get()), + file_name_(file_name) {} + + ~FSRandomAccessFileTracingWrapper() override {} + + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override; + + IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs, + const IOOptions& options, IODebugContext* dbg) override; + + IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus InvalidateCache(size_t offset, size_t length) override; + + IOStatus ReadAsync(FSReadRequest& req, const IOOptions& opts, + std::function<void(const FSReadRequest&, void*)> cb, + void* cb_arg, void** io_handle, IOHandleDeleter* del_fn, + IODebugContext* dbg) override; + + void ReadAsyncCallback(const FSReadRequest& req, void* cb_arg); + + private: + std::shared_ptr<IOTracer> io_tracer_; + SystemClock* clock_; + // Stores file name instead of full path. + std::string file_name_; + + struct ReadAsyncCallbackInfo { + uint64_t start_time_; + std::function<void(const FSReadRequest&, void*)> cb_; + void* cb_arg_; + std::string file_op_; + }; +}; + +// The FSRandomAccessFilePtr is a wrapper class that takes pointer to storage +// systems (such as posix filesystems). It overloads operator -> and returns a +// pointer of either FSRandomAccessFile or FSRandomAccessFileTracingWrapper +// based on whether tracing is enabled or not. It is added to bypass +// FSRandomAccessFileTracingWrapper when tracing is disabled. 
+class FSRandomAccessFilePtr { + public: + FSRandomAccessFilePtr(std::unique_ptr<FSRandomAccessFile>&& fs, + const std::shared_ptr<IOTracer>& io_tracer, + const std::string& file_name) + : io_tracer_(io_tracer), + fs_tracer_(std::move(fs), io_tracer_, + file_name.substr(file_name.find_last_of("/\\") + + 1) /* pass file name */) {} + + FSRandomAccessFile* operator->() const { + if (io_tracer_ && io_tracer_->is_tracing_enabled()) { + return const_cast<FSRandomAccessFileTracingWrapper*>(&fs_tracer_); + } else { + return fs_tracer_.target(); + } + } + + FSRandomAccessFile* get() const { + if (io_tracer_ && io_tracer_->is_tracing_enabled()) { + return const_cast<FSRandomAccessFileTracingWrapper*>(&fs_tracer_); + } else { + return fs_tracer_.target(); + } + } + + private: + std::shared_ptr<IOTracer> io_tracer_; + FSRandomAccessFileTracingWrapper fs_tracer_; +}; + +// FSWritableFileTracingWrapper is a wrapper class above FSWritableFile that +// forwards the call to the underlying storage system. It then invokes IOTracer +// to record file operations and other contextual information in a binary format +// for tracing. It overrides methods we are interested in tracing and extends +// FSWritableFileWrapper, which forwards all methods that are not explicitly +// overridden. 
+class FSWritableFileTracingWrapper : public FSWritableFileOwnerWrapper { + public: + FSWritableFileTracingWrapper(std::unique_ptr<FSWritableFile>&& t, + std::shared_ptr<IOTracer> io_tracer, + const std::string& file_name) + : FSWritableFileOwnerWrapper(std::move(t)), + io_tracer_(io_tracer), + clock_(SystemClock::Default().get()), + file_name_(file_name) {} + + ~FSWritableFileTracingWrapper() override {} + + IOStatus Append(const Slice& data, const IOOptions& options, + IODebugContext* dbg) override; + IOStatus Append(const Slice& data, const IOOptions& options, + const DataVerificationInfo& /*verification_info*/, + IODebugContext* dbg) override { + return Append(data, options, dbg); + } + + IOStatus PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& options, + IODebugContext* dbg) override; + IOStatus PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& options, + const DataVerificationInfo& /*verification_info*/, + IODebugContext* dbg) override { + return PositionedAppend(data, offset, options, dbg); + } + + IOStatus Truncate(uint64_t size, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override; + + uint64_t GetFileSize(const IOOptions& options, IODebugContext* dbg) override; + + IOStatus InvalidateCache(size_t offset, size_t length) override; + + private: + std::shared_ptr<IOTracer> io_tracer_; + SystemClock* clock_; + // Stores file name instead of full path. + std::string file_name_; +}; + +// The FSWritableFilePtr is a wrapper class that takes pointer to storage +// systems (such as posix filesystems). It overloads operator -> and returns a +// pointer of either FSWritableFile or FSWritableFileTracingWrapper based on +// whether tracing is enabled or not. It is added to bypass +// FSWritableFileTracingWrapper when tracing is disabled. 
+class FSWritableFilePtr { + public: + FSWritableFilePtr(std::unique_ptr<FSWritableFile>&& fs, + const std::shared_ptr<IOTracer>& io_tracer, + const std::string& file_name) + : io_tracer_(io_tracer) { + fs_tracer_.reset(new FSWritableFileTracingWrapper( + std::move(fs), io_tracer_, + file_name.substr(file_name.find_last_of("/\\") + + 1) /* pass file name */)); + } + + FSWritableFile* operator->() const { + if (io_tracer_ && io_tracer_->is_tracing_enabled()) { + return fs_tracer_.get(); + } else { + return fs_tracer_->target(); + } + } + + FSWritableFile* get() const { + if (io_tracer_ && io_tracer_->is_tracing_enabled()) { + return fs_tracer_.get(); + } else if (fs_tracer_) { + return fs_tracer_->target(); + } else { + return nullptr; + } + } + + void reset() { + fs_tracer_.reset(); + io_tracer_ = nullptr; + } + + private: + std::shared_ptr<IOTracer> io_tracer_; + std::unique_ptr<FSWritableFileTracingWrapper> fs_tracer_; +}; + +// FSRandomRWFileTracingWrapper is a wrapper class above FSRandomRWFile that +// forwards the call to the underlying storage system. It then invokes IOTracer +// to record file operations and other contextual information in a binary format +// for tracing. It overrides methods we are interested in tracing and extends +// FSRandomRWFileWrapper, which forwards all methods that are not explicitly +// overridden. 
+class FSRandomRWFileTracingWrapper : public FSRandomRWFileOwnerWrapper { + public: + FSRandomRWFileTracingWrapper(std::unique_ptr<FSRandomRWFile>&& t, + std::shared_ptr<IOTracer> io_tracer, + const std::string& file_name) + : FSRandomRWFileOwnerWrapper(std::move(t)), + io_tracer_(io_tracer), + clock_(SystemClock::Default().get()), + file_name_(file_name) {} + + ~FSRandomRWFileTracingWrapper() override {} + + IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override; + + IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override; + + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override; + + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override; + + IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override; + + private: + std::shared_ptr<IOTracer> io_tracer_; + SystemClock* clock_; + // Stores file name instead of full path. + std::string file_name_; +}; + +// The FSRandomRWFilePtr is a wrapper class that takes pointer to storage +// systems (such as posix filesystems). It overloads operator -> and returns a +// pointer of either FSRandomRWFile or FSRandomRWFileTracingWrapper based on +// whether tracing is enabled or not. It is added to bypass +// FSRandomRWFileTracingWrapper when tracing is disabled. 
+class FSRandomRWFilePtr { + public: + FSRandomRWFilePtr(std::unique_ptr<FSRandomRWFile>&& fs, + std::shared_ptr<IOTracer> io_tracer, + const std::string& file_name) + : io_tracer_(io_tracer), + fs_tracer_(std::move(fs), io_tracer_, + file_name.substr(file_name.find_last_of("/\\") + + 1) /* pass file name */) {} + + FSRandomRWFile* operator->() const { + if (io_tracer_ && io_tracer_->is_tracing_enabled()) { + return const_cast<FSRandomRWFileTracingWrapper*>(&fs_tracer_); + } else { + return fs_tracer_.target(); + } + } + + FSRandomRWFile* get() const { + if (io_tracer_ && io_tracer_->is_tracing_enabled()) { + return const_cast<FSRandomRWFileTracingWrapper*>(&fs_tracer_); + } else { + return fs_tracer_.target(); + } + } + + private: + std::shared_ptr<IOTracer> io_tracer_; + FSRandomRWFileTracingWrapper fs_tracer_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/env/fs_posix.cc b/src/rocksdb/env/fs_posix.cc new file mode 100644 index 000000000..e179a421d --- /dev/null +++ b/src/rocksdb/env/fs_posix.cc @@ -0,0 +1,1294 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
See the AUTHORS file for names of contributors + +#if !defined(OS_WIN) + +#include <dirent.h> +#ifndef ROCKSDB_NO_DYNAMIC_EXTENSION +#include <dlfcn.h> +#endif +#include <errno.h> +#include <fcntl.h> +#include <pthread.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <sys/stat.h> +#if defined(OS_LINUX) || defined(OS_SOLARIS) || defined(OS_ANDROID) +#include <sys/statfs.h> +#include <sys/sysmacros.h> +#endif +#include <sys/statvfs.h> +#include <sys/time.h> +#include <sys/types.h> +#include <time.h> + +#include <algorithm> +// Get nano time includes +#if defined(OS_LINUX) || defined(OS_FREEBSD) +#elif defined(__MACH__) +#include <Availability.h> +#include <mach/clock.h> +#include <mach/mach.h> +#else +#include <chrono> +#endif +#include <deque> +#include <set> +#include <vector> + +#include "env/composite_env_wrapper.h" +#include "env/io_posix.h" +#include "monitoring/iostats_context_imp.h" +#include "monitoring/thread_status_updater.h" +#include "port/lang.h" +#include "port/port.h" +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/utilities/object_registry.h" +#include "test_util/sync_point.h" +#include "util/coding.h" +#include "util/compression_context_cache.h" +#include "util/random.h" +#include "util/string_util.h" +#include "util/thread_local.h" +#include "util/threadpool_imp.h" + +#if !defined(TMPFS_MAGIC) +#define TMPFS_MAGIC 0x01021994 +#endif +#if !defined(XFS_SUPER_MAGIC) +#define XFS_SUPER_MAGIC 0x58465342 +#endif +#if !defined(EXT4_SUPER_MAGIC) +#define EXT4_SUPER_MAGIC 0xEF53 +#endif + +extern "C" bool RocksDbIOUringEnable() __attribute__((__weak__)); + +namespace ROCKSDB_NAMESPACE { + +namespace { + +inline mode_t GetDBFileMode(bool allow_non_owner_access) { + return allow_non_owner_access ? 0644 : 0600; +} + +// list of pathnames that are locked +// Only used for error message. 
+struct LockHoldingInfo { + int64_t acquire_time; + uint64_t acquiring_thread; +}; +static std::map<std::string, LockHoldingInfo> locked_files; +static port::Mutex mutex_locked_files; + +static int LockOrUnlock(int fd, bool lock) { + errno = 0; + struct flock f; + memset(&f, 0, sizeof(f)); + f.l_type = (lock ? F_WRLCK : F_UNLCK); + f.l_whence = SEEK_SET; + f.l_start = 0; + f.l_len = 0; // Lock/unlock entire file + int value = fcntl(fd, F_SETLK, &f); + + return value; +} + +class PosixFileLock : public FileLock { + public: + int fd_ = /*invalid*/ -1; + std::string filename; + + void Clear() { + fd_ = -1; + filename.clear(); + } + + virtual ~PosixFileLock() override { + // Check for destruction without UnlockFile + assert(fd_ == -1); + } +}; + +int cloexec_flags(int flags, const EnvOptions* options) { + // If the system supports opening the file with cloexec enabled, + // do so, as this avoids a race condition if a db is opened around + // the same time that a child process is forked +#ifdef O_CLOEXEC + if (options == nullptr || options->set_fd_cloexec) { + flags |= O_CLOEXEC; + } +#else + (void)options; +#endif + return flags; +} + +class PosixFileSystem : public FileSystem { + public: + PosixFileSystem(); + + static const char* kClassName() { return "PosixFileSystem"; } + const char* Name() const override { return kClassName(); } + const char* NickName() const override { return kDefaultName(); } + + ~PosixFileSystem() override {} + bool IsInstanceOf(const std::string& name) const override { + if (name == "posix") { + return true; + } else { + return FileSystem::IsInstanceOf(name); + } + } + + void SetFD_CLOEXEC(int fd, const EnvOptions* options) { + if ((options == nullptr || options->set_fd_cloexec) && fd > 0) { + fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC); + } + } + + IOStatus NewSequentialFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr<FSSequentialFile>* result, + IODebugContext* /*dbg*/) override { + result->reset(); + 
int fd = -1; + int flags = cloexec_flags(O_RDONLY, &options); + FILE* file = nullptr; + + if (options.use_direct_reads && !options.use_mmap_reads) { +#ifdef ROCKSDB_LITE + return IOStatus::IOError(fname, + "Direct I/O not supported in RocksDB lite"); +#endif // !ROCKSDB_LITE +#if !defined(OS_MACOSX) && !defined(OS_OPENBSD) && !defined(OS_SOLARIS) + flags |= O_DIRECT; + TEST_SYNC_POINT_CALLBACK("NewSequentialFile:O_DIRECT", &flags); +#endif + } + + do { + IOSTATS_TIMER_GUARD(open_nanos); + fd = open(fname.c_str(), flags, GetDBFileMode(allow_non_owner_access_)); + } while (fd < 0 && errno == EINTR); + if (fd < 0) { + return IOError("While opening a file for sequentially reading", fname, + errno); + } + + SetFD_CLOEXEC(fd, &options); + + if (options.use_direct_reads && !options.use_mmap_reads) { +#ifdef OS_MACOSX + if (fcntl(fd, F_NOCACHE, 1) == -1) { + close(fd); + return IOError("While fcntl NoCache", fname, errno); + } +#endif + } else { + do { + IOSTATS_TIMER_GUARD(open_nanos); + file = fdopen(fd, "r"); + } while (file == nullptr && errno == EINTR); + if (file == nullptr) { + close(fd); + return IOError("While opening file for sequentially read", fname, + errno); + } + } + result->reset(new PosixSequentialFile( + fname, file, fd, GetLogicalBlockSizeForReadIfNeeded(options, fname, fd), + options)); + return IOStatus::OK(); + } + + IOStatus NewRandomAccessFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr<FSRandomAccessFile>* result, + IODebugContext* /*dbg*/) override { + result->reset(); + IOStatus s = IOStatus::OK(); + int fd; + int flags = cloexec_flags(O_RDONLY, &options); + + if (options.use_direct_reads && !options.use_mmap_reads) { +#ifdef ROCKSDB_LITE + return IOStatus::IOError(fname, + "Direct I/O not supported in RocksDB lite"); +#endif // !ROCKSDB_LITE +#if !defined(OS_MACOSX) && !defined(OS_OPENBSD) && !defined(OS_SOLARIS) + flags |= O_DIRECT; + TEST_SYNC_POINT_CALLBACK("NewRandomAccessFile:O_DIRECT", &flags); +#endif + } + + 
do { + IOSTATS_TIMER_GUARD(open_nanos); + fd = open(fname.c_str(), flags, GetDBFileMode(allow_non_owner_access_)); + } while (fd < 0 && errno == EINTR); + if (fd < 0) { + s = IOError("While open a file for random read", fname, errno); + return s; + } + SetFD_CLOEXEC(fd, &options); + + if (options.use_mmap_reads) { + // Use of mmap for random reads has been removed because it + // kills performance when storage is fast. + // Use mmap when virtual address-space is plentiful. + uint64_t size; + IOOptions opts; + s = GetFileSize(fname, opts, &size, nullptr); + if (s.ok()) { + void* base = mmap(nullptr, size, PROT_READ, MAP_SHARED, fd, 0); + if (base != MAP_FAILED) { + result->reset( + new PosixMmapReadableFile(fd, fname, base, size, options)); + } else { + s = IOError("while mmap file for read", fname, errno); + close(fd); + } + } else { + close(fd); + } + } else { + if (options.use_direct_reads && !options.use_mmap_reads) { +#ifdef OS_MACOSX + if (fcntl(fd, F_NOCACHE, 1) == -1) { + close(fd); + return IOError("while fcntl NoCache", fname, errno); + } +#endif + } + result->reset(new PosixRandomAccessFile( + fname, fd, GetLogicalBlockSizeForReadIfNeeded(options, fname, fd), + options +#if defined(ROCKSDB_IOURING_PRESENT) + , + !IsIOUringEnabled() ? nullptr : thread_local_io_urings_.get() +#endif + )); + } + return s; + } + + virtual IOStatus OpenWritableFile(const std::string& fname, + const FileOptions& options, bool reopen, + std::unique_ptr<FSWritableFile>* result, + IODebugContext* /*dbg*/) { + result->reset(); + IOStatus s; + int fd = -1; + int flags = (reopen) ? (O_CREAT | O_APPEND) : (O_CREAT | O_TRUNC); + // Direct IO mode with O_DIRECT flag or F_NOCAHCE (MAC OSX) + if (options.use_direct_writes && !options.use_mmap_writes) { + // Note: we should avoid O_APPEND here due to ta the following bug: + // POSIX requires that opening a file with the O_APPEND flag should + // have no affect on the location at which pwrite() writes data. 
+ // However, on Linux, if a file is opened with O_APPEND, pwrite() + // appends data to the end of the file, regardless of the value of + // offset. + // More info here: https://linux.die.net/man/2/pwrite +#ifdef ROCKSDB_LITE + return IOStatus::IOError(fname, + "Direct I/O not supported in RocksDB lite"); +#endif // ROCKSDB_LITE + flags |= O_WRONLY; +#if !defined(OS_MACOSX) && !defined(OS_OPENBSD) && !defined(OS_SOLARIS) + flags |= O_DIRECT; +#endif + TEST_SYNC_POINT_CALLBACK("NewWritableFile:O_DIRECT", &flags); + } else if (options.use_mmap_writes) { + // non-direct I/O + flags |= O_RDWR; + } else { + flags |= O_WRONLY; + } + + flags = cloexec_flags(flags, &options); + + do { + IOSTATS_TIMER_GUARD(open_nanos); + fd = open(fname.c_str(), flags, GetDBFileMode(allow_non_owner_access_)); + } while (fd < 0 && errno == EINTR); + + if (fd < 0) { + s = IOError("While open a file for appending", fname, errno); + return s; + } + SetFD_CLOEXEC(fd, &options); + + if (options.use_mmap_writes) { + MaybeForceDisableMmap(fd); + } + if (options.use_mmap_writes && !forceMmapOff_) { + result->reset(new PosixMmapFile(fname, fd, page_size_, options)); + } else if (options.use_direct_writes && !options.use_mmap_writes) { +#ifdef OS_MACOSX + if (fcntl(fd, F_NOCACHE, 1) == -1) { + close(fd); + s = IOError("While fcntl NoCache an opened file for appending", fname, + errno); + return s; + } +#elif defined(OS_SOLARIS) + if (directio(fd, DIRECTIO_ON) == -1) { + if (errno != ENOTTY) { // ZFS filesystems don't support DIRECTIO_ON + close(fd); + s = IOError("While calling directio()", fname, errno); + return s; + } + } +#endif + result->reset(new PosixWritableFile( + fname, fd, GetLogicalBlockSizeForWriteIfNeeded(options, fname, fd), + options)); + } else { + // disable mmap writes + EnvOptions no_mmap_writes_options = options; + no_mmap_writes_options.use_mmap_writes = false; + result->reset( + new PosixWritableFile(fname, fd, + GetLogicalBlockSizeForWriteIfNeeded( + no_mmap_writes_options, 
fname, fd), + no_mmap_writes_options)); + } + return s; + } + + IOStatus NewWritableFile(const std::string& fname, const FileOptions& options, + std::unique_ptr<FSWritableFile>* result, + IODebugContext* dbg) override { + return OpenWritableFile(fname, options, false, result, dbg); + } + + IOStatus ReopenWritableFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr<FSWritableFile>* result, + IODebugContext* dbg) override { + return OpenWritableFile(fname, options, true, result, dbg); + } + + IOStatus ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + const FileOptions& options, + std::unique_ptr<FSWritableFile>* result, + IODebugContext* /*dbg*/) override { + result->reset(); + IOStatus s; + int fd = -1; + + int flags = 0; + // Direct IO mode with O_DIRECT flag or F_NOCAHCE (MAC OSX) + if (options.use_direct_writes && !options.use_mmap_writes) { +#ifdef ROCKSDB_LITE + return IOStatus::IOError(fname, + "Direct I/O not supported in RocksDB lite"); +#endif // !ROCKSDB_LITE + flags |= O_WRONLY; +#if !defined(OS_MACOSX) && !defined(OS_OPENBSD) && !defined(OS_SOLARIS) + flags |= O_DIRECT; +#endif + TEST_SYNC_POINT_CALLBACK("NewWritableFile:O_DIRECT", &flags); + } else if (options.use_mmap_writes) { + // mmap needs O_RDWR mode + flags |= O_RDWR; + } else { + flags |= O_WRONLY; + } + + flags = cloexec_flags(flags, &options); + + do { + IOSTATS_TIMER_GUARD(open_nanos); + fd = open(old_fname.c_str(), flags, + GetDBFileMode(allow_non_owner_access_)); + } while (fd < 0 && errno == EINTR); + if (fd < 0) { + s = IOError("while reopen file for write", fname, errno); + return s; + } + + SetFD_CLOEXEC(fd, &options); + // rename into place + if (rename(old_fname.c_str(), fname.c_str()) != 0) { + s = IOError("while rename file to " + fname, old_fname, errno); + close(fd); + return s; + } + + if (options.use_mmap_writes) { + MaybeForceDisableMmap(fd); + } + if (options.use_mmap_writes && !forceMmapOff_) { + result->reset(new 
PosixMmapFile(fname, fd, page_size_, options)); + } else if (options.use_direct_writes && !options.use_mmap_writes) { +#ifdef OS_MACOSX + if (fcntl(fd, F_NOCACHE, 1) == -1) { + close(fd); + s = IOError("while fcntl NoCache for reopened file for append", fname, + errno); + return s; + } +#elif defined(OS_SOLARIS) + if (directio(fd, DIRECTIO_ON) == -1) { + if (errno != ENOTTY) { // ZFS filesystems don't support DIRECTIO_ON + close(fd); + s = IOError("while calling directio()", fname, errno); + return s; + } + } +#endif + result->reset(new PosixWritableFile( + fname, fd, GetLogicalBlockSizeForWriteIfNeeded(options, fname, fd), + options)); + } else { + // disable mmap writes + FileOptions no_mmap_writes_options = options; + no_mmap_writes_options.use_mmap_writes = false; + result->reset( + new PosixWritableFile(fname, fd, + GetLogicalBlockSizeForWriteIfNeeded( + no_mmap_writes_options, fname, fd), + no_mmap_writes_options)); + } + return s; + } + + IOStatus NewRandomRWFile(const std::string& fname, const FileOptions& options, + std::unique_ptr<FSRandomRWFile>* result, + IODebugContext* /*dbg*/) override { + int fd = -1; + int flags = cloexec_flags(O_RDWR, &options); + + while (fd < 0) { + IOSTATS_TIMER_GUARD(open_nanos); + + fd = open(fname.c_str(), flags, GetDBFileMode(allow_non_owner_access_)); + if (fd < 0) { + // Error while opening the file + if (errno == EINTR) { + continue; + } + return IOError("While open file for random read/write", fname, errno); + } + } + + SetFD_CLOEXEC(fd, &options); + result->reset(new PosixRandomRWFile(fname, fd, options)); + return IOStatus::OK(); + } + + IOStatus NewMemoryMappedFileBuffer( + const std::string& fname, + std::unique_ptr<MemoryMappedFileBuffer>* result) override { + int fd = -1; + IOStatus status; + int flags = cloexec_flags(O_RDWR, nullptr); + + while (fd < 0) { + IOSTATS_TIMER_GUARD(open_nanos); + fd = open(fname.c_str(), flags, 0644); + if (fd < 0) { + // Error while opening the file + if (errno == EINTR) { + 
continue; + } + status = + IOError("While open file for raw mmap buffer access", fname, errno); + break; + } + } + uint64_t size; + if (status.ok()) { + IOOptions opts; + status = GetFileSize(fname, opts, &size, nullptr); + } + void* base = nullptr; + if (status.ok()) { + base = mmap(nullptr, static_cast<size_t>(size), PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0); + if (base == MAP_FAILED) { + status = IOError("while mmap file for read", fname, errno); + } + } + if (status.ok()) { + result->reset( + new PosixMemoryMappedFileBuffer(base, static_cast<size_t>(size))); + } + if (fd >= 0) { + // don't need to keep it open after mmap has been called + close(fd); + } + return status; + } + + IOStatus NewDirectory(const std::string& name, const IOOptions& /*opts*/, + std::unique_ptr<FSDirectory>* result, + IODebugContext* /*dbg*/) override { + result->reset(); + int fd; + int flags = cloexec_flags(0, nullptr); + { + IOSTATS_TIMER_GUARD(open_nanos); + fd = open(name.c_str(), flags); + } + if (fd < 0) { + return IOError("While open directory", name, errno); + } else { + result->reset(new PosixDirectory(fd, name)); + } + return IOStatus::OK(); + } + + IOStatus FileExists(const std::string& fname, const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { + int result = access(fname.c_str(), F_OK); + + if (result == 0) { + return IOStatus::OK(); + } + + int err = errno; + switch (err) { + case EACCES: + case ELOOP: + case ENAMETOOLONG: + case ENOENT: + case ENOTDIR: + return IOStatus::NotFound(); + default: + assert(err == EIO || err == ENOMEM); + return IOStatus::IOError("Unexpected error(" + std::to_string(err) + + ") accessing file `" + fname + "' "); + } + } + + IOStatus GetChildren(const std::string& dir, const IOOptions& opts, + std::vector<std::string>* result, + IODebugContext* /*dbg*/) override { + result->clear(); + + DIR* d = opendir(dir.c_str()); + if (d == nullptr) { + switch (errno) { + case EACCES: + case ENOENT: + case ENOTDIR: + return 
IOStatus::NotFound(); + default: + return IOError("While opendir", dir, errno); + } + } + + // reset errno before calling readdir() + errno = 0; + struct dirent* entry; + + while ((entry = readdir(d)) != nullptr) { + // filter out '.' and '..' directory entries + // which appear only on some platforms + const bool ignore = + entry->d_type == DT_DIR && + (strcmp(entry->d_name, ".") == 0 || + strcmp(entry->d_name, "..") == 0 +#ifndef ASSERT_STATUS_CHECKED + // In case of ASSERT_STATUS_CHECKED, GetChildren support older + // version of API for debugging purpose. + || opts.do_not_recurse +#endif + ); + if (!ignore) { + result->push_back(entry->d_name); + } + errno = 0; // reset errno if readdir() success + } + + // always attempt to close the dir + const auto pre_close_errno = errno; // errno may be modified by closedir + const int close_result = closedir(d); + + if (pre_close_errno != 0) { + // error occurred during readdir + return IOError("While readdir", dir, pre_close_errno); + } + + if (close_result != 0) { + // error occurred during closedir + return IOError("While closedir", dir, errno); + } + + return IOStatus::OK(); + } + + IOStatus DeleteFile(const std::string& fname, const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { + IOStatus result; + if (unlink(fname.c_str()) != 0) { + result = IOError("while unlink() file", fname, errno); + } + return result; + } + + IOStatus CreateDir(const std::string& name, const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { + if (mkdir(name.c_str(), 0755) != 0) { + return IOError("While mkdir", name, errno); + } + return IOStatus::OK(); + } + + IOStatus CreateDirIfMissing(const std::string& name, + const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { + if (mkdir(name.c_str(), 0755) != 0) { + if (errno != EEXIST) { + return IOError("While mkdir if missing", name, errno); + } else if (!DirExists(name)) { // Check that name is actually a + // directory. 
+ // Message is taken from mkdir + return IOStatus::IOError("`" + name + + "' exists but is not a directory"); + } + } + return IOStatus::OK(); + } + + IOStatus DeleteDir(const std::string& name, const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { + if (rmdir(name.c_str()) != 0) { + return IOError("file rmdir", name, errno); + } + return IOStatus::OK(); + } + + IOStatus GetFileSize(const std::string& fname, const IOOptions& /*opts*/, + uint64_t* size, IODebugContext* /*dbg*/) override { + struct stat sbuf; + if (stat(fname.c_str(), &sbuf) != 0) { + *size = 0; + return IOError("while stat a file for size", fname, errno); + } else { + *size = sbuf.st_size; + } + return IOStatus::OK(); + } + + IOStatus GetFileModificationTime(const std::string& fname, + const IOOptions& /*opts*/, + uint64_t* file_mtime, + IODebugContext* /*dbg*/) override { + struct stat s; + if (stat(fname.c_str(), &s) != 0) { + return IOError("while stat a file for modification time", fname, errno); + } + *file_mtime = static_cast<uint64_t>(s.st_mtime); + return IOStatus::OK(); + } + + IOStatus RenameFile(const std::string& src, const std::string& target, + const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { + if (rename(src.c_str(), target.c_str()) != 0) { + return IOError("While renaming a file to " + target, src, errno); + } + return IOStatus::OK(); + } + + IOStatus LinkFile(const std::string& src, const std::string& target, + const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { + if (link(src.c_str(), target.c_str()) != 0) { + if (errno == EXDEV || errno == ENOTSUP) { + return IOStatus::NotSupported(errno == EXDEV + ? 
"No cross FS links allowed" + : "Links not supported by FS"); + } + return IOError("while link file to " + target, src, errno); + } + return IOStatus::OK(); + } + + IOStatus NumFileLinks(const std::string& fname, const IOOptions& /*opts*/, + uint64_t* count, IODebugContext* /*dbg*/) override { + struct stat s; + if (stat(fname.c_str(), &s) != 0) { + return IOError("while stat a file for num file links", fname, errno); + } + *count = static_cast<uint64_t>(s.st_nlink); + return IOStatus::OK(); + } + + IOStatus AreFilesSame(const std::string& first, const std::string& second, + const IOOptions& /*opts*/, bool* res, + IODebugContext* /*dbg*/) override { + struct stat statbuf[2]; + if (stat(first.c_str(), &statbuf[0]) != 0) { + return IOError("stat file", first, errno); + } + if (stat(second.c_str(), &statbuf[1]) != 0) { + return IOError("stat file", second, errno); + } + + if (major(statbuf[0].st_dev) != major(statbuf[1].st_dev) || + minor(statbuf[0].st_dev) != minor(statbuf[1].st_dev) || + statbuf[0].st_ino != statbuf[1].st_ino) { + *res = false; + } else { + *res = true; + } + return IOStatus::OK(); + } + + IOStatus LockFile(const std::string& fname, const IOOptions& /*opts*/, + FileLock** lock, IODebugContext* /*dbg*/) override { + *lock = nullptr; + + LockHoldingInfo lhi; + int64_t current_time = 0; + // Ignore status code as the time is only used for error message. + SystemClock::Default() + ->GetCurrentTime(¤t_time) + .PermitUncheckedError(); + lhi.acquire_time = current_time; + lhi.acquiring_thread = Env::Default()->GetThreadID(); + + mutex_locked_files.Lock(); + // If it already exists in the locked_files set, then it is already locked, + // and fail this lock attempt. Otherwise, insert it into locked_files. + // This check is needed because fcntl() does not detect lock conflict + // if the fcntl is issued by the same thread that earlier acquired + // this lock. 
+ // We must do this check *before* opening the file: + // Otherwise, we will open a new file descriptor. Locks are associated with + // a process, not a file descriptor and when *any* file descriptor is + // closed, all locks the process holds for that *file* are released + const auto it_success = locked_files.insert({fname, lhi}); + if (it_success.second == false) { + LockHoldingInfo prev_info = it_success.first->second; + mutex_locked_files.Unlock(); + errno = ENOLCK; + // Note that the thread ID printed is the same one as the one in + // posix logger, but posix logger prints it hex format. + return IOError("lock hold by current process, acquire time " + + std::to_string(prev_info.acquire_time) + + " acquiring thread " + + std::to_string(prev_info.acquiring_thread), + fname, errno); + } + + IOStatus result = IOStatus::OK(); + int fd; + int flags = cloexec_flags(O_RDWR | O_CREAT, nullptr); + + { + IOSTATS_TIMER_GUARD(open_nanos); + fd = open(fname.c_str(), flags, 0644); + } + if (fd < 0) { + result = IOError("while open a file for lock", fname, errno); + } else if (LockOrUnlock(fd, true) == -1) { + result = IOError("While lock file", fname, errno); + close(fd); + } else { + SetFD_CLOEXEC(fd, nullptr); + PosixFileLock* my_lock = new PosixFileLock; + my_lock->fd_ = fd; + my_lock->filename = fname; + *lock = my_lock; + } + if (!result.ok()) { + // If there is an error in locking, then remove the pathname from + // locked_files. (If we got this far, it did not exist in locked_files + // before this call.) + locked_files.erase(fname); + } + + mutex_locked_files.Unlock(); + return result; + } + + IOStatus UnlockFile(FileLock* lock, const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { + PosixFileLock* my_lock = reinterpret_cast<PosixFileLock*>(lock); + IOStatus result; + mutex_locked_files.Lock(); + // If we are unlocking, then verify that we had locked it earlier, + // it should already exist in locked_files. Remove it from locked_files. 
+ if (locked_files.erase(my_lock->filename) != 1) { + errno = ENOLCK; + result = IOError("unlock", my_lock->filename, errno); + } else if (LockOrUnlock(my_lock->fd_, false) == -1) { + result = IOError("unlock", my_lock->filename, errno); + } + close(my_lock->fd_); + my_lock->Clear(); + delete my_lock; + mutex_locked_files.Unlock(); + return result; + } + + IOStatus GetAbsolutePath(const std::string& db_path, + const IOOptions& /*opts*/, std::string* output_path, + IODebugContext* /*dbg*/) override { + if (!db_path.empty() && db_path[0] == '/') { + *output_path = db_path; + return IOStatus::OK(); + } + + char the_path[4096]; + char* ret = getcwd(the_path, 4096); + if (ret == nullptr) { + return IOStatus::IOError(errnoStr(errno).c_str()); + } + + *output_path = ret; + return IOStatus::OK(); + } + + IOStatus GetTestDirectory(const IOOptions& /*opts*/, std::string* result, + IODebugContext* /*dbg*/) override { + const char* env = getenv("TEST_TMPDIR"); + if (env && env[0] != '\0') { + *result = env; + } else { + char buf[100]; + snprintf(buf, sizeof(buf), "/tmp/rocksdbtest-%d", int(geteuid())); + *result = buf; + } + // Directory may already exist + { + IOOptions opts; + return CreateDirIfMissing(*result, opts, nullptr); + } + return IOStatus::OK(); + } + + IOStatus GetFreeSpace(const std::string& fname, const IOOptions& /*opts*/, + uint64_t* free_space, + IODebugContext* /*dbg*/) override { + struct statvfs sbuf; + + if (statvfs(fname.c_str(), &sbuf) < 0) { + return IOError("While doing statvfs", fname, errno); + } + + // sbuf.bfree is total free space available to root + // sbuf.bavail is total free space available to unprivileged user + // sbuf.bavail <= sbuf.bfree ... pick correct based upon effective user id + if (geteuid()) { + // non-zero user is unprivileged, or -1 if error. 
take more conservative + // size + *free_space = ((uint64_t)sbuf.f_bsize * sbuf.f_bavail); + } else { + // root user can access all disk space + *free_space = ((uint64_t)sbuf.f_bsize * sbuf.f_bfree); + } + return IOStatus::OK(); + } + + IOStatus IsDirectory(const std::string& path, const IOOptions& /*opts*/, + bool* is_dir, IODebugContext* /*dbg*/) override { + // First open + int fd = -1; + int flags = cloexec_flags(O_RDONLY, nullptr); + { + IOSTATS_TIMER_GUARD(open_nanos); + fd = open(path.c_str(), flags); + } + if (fd < 0) { + return IOError("While open for IsDirectory()", path, errno); + } + IOStatus io_s; + struct stat sbuf; + if (fstat(fd, &sbuf) < 0) { + io_s = IOError("While doing stat for IsDirectory()", path, errno); + } + close(fd); + if (io_s.ok() && nullptr != is_dir) { + *is_dir = S_ISDIR(sbuf.st_mode); + } + return io_s; + } + + FileOptions OptimizeForLogWrite(const FileOptions& file_options, + const DBOptions& db_options) const override { + FileOptions optimized = file_options; + optimized.use_mmap_writes = false; + optimized.use_direct_writes = false; + optimized.bytes_per_sync = db_options.wal_bytes_per_sync; + // TODO(icanadi) it's faster if fallocate_with_keep_size is false, but it + // breaks TransactionLogIteratorStallAtLastRecord unit test. 
Fix the unit + // test and make this false + optimized.fallocate_with_keep_size = true; + optimized.writable_file_max_buffer_size = + db_options.writable_file_max_buffer_size; + return optimized; + } + + FileOptions OptimizeForManifestWrite( + const FileOptions& file_options) const override { + FileOptions optimized = file_options; + optimized.use_mmap_writes = false; + optimized.use_direct_writes = false; + optimized.fallocate_with_keep_size = true; + return optimized; + } +#ifdef OS_LINUX + Status RegisterDbPaths(const std::vector<std::string>& paths) override { + return logical_block_size_cache_.RefAndCacheLogicalBlockSize(paths); + } + Status UnregisterDbPaths(const std::vector<std::string>& paths) override { + logical_block_size_cache_.UnrefAndTryRemoveCachedLogicalBlockSize(paths); + return Status::OK(); + } +#endif + private: + bool forceMmapOff_ = false; // do we override Env options? + + // Returns true iff the named directory exists and is a directory. + virtual bool DirExists(const std::string& dname) { + struct stat statbuf; + if (stat(dname.c_str(), &statbuf) == 0) { + return S_ISDIR(statbuf.st_mode); + } + return false; // stat() failed return false + } + + bool SupportsFastAllocate(int fd) { +#ifdef ROCKSDB_FALLOCATE_PRESENT + struct statfs s; + if (fstatfs(fd, &s)) { + return false; + } + switch (s.f_type) { + case EXT4_SUPER_MAGIC: + return true; + case XFS_SUPER_MAGIC: + return true; + case TMPFS_MAGIC: + return true; + default: + return false; + } +#else + (void)fd; + return false; +#endif + } + + void MaybeForceDisableMmap(int fd) { + static std::once_flag s_check_disk_for_mmap_once; + assert(this == FileSystem::Default().get()); + std::call_once( + s_check_disk_for_mmap_once, + [this](int fdesc) { + // this will be executed once in the program's lifetime. + // do not use mmapWrite on non ext-3/xfs/tmpfs systems. 
+ if (!SupportsFastAllocate(fdesc)) { + forceMmapOff_ = true; + } + }, + fd); + } + +#ifdef ROCKSDB_IOURING_PRESENT + bool IsIOUringEnabled() { + if (RocksDbIOUringEnable && RocksDbIOUringEnable()) { + return true; + } else { + return false; + } + } +#endif // ROCKSDB_IOURING_PRESENT + + // EXPERIMENTAL + // + // TODO akankshamahajan: + // 1. Update Poll API to take into account min_completions + // and returns if number of handles in io_handles (any order) completed is + // equal to atleast min_completions. + // 2. Currently in case of direct_io, Read API is called because of which call + // to Poll API fails as it expects IOHandle to be populated. + virtual IOStatus Poll(std::vector<void*>& io_handles, + size_t /*min_completions*/) override { +#if defined(ROCKSDB_IOURING_PRESENT) + // io_uring_queue_init. + struct io_uring* iu = nullptr; + if (thread_local_io_urings_) { + iu = static_cast<struct io_uring*>(thread_local_io_urings_->Get()); + } + + // Init failed, platform doesn't support io_uring. + if (iu == nullptr) { + return IOStatus::NotSupported("Poll"); + } + + for (size_t i = 0; i < io_handles.size(); i++) { + // The request has been completed in earlier runs. + if ((static_cast<Posix_IOHandle*>(io_handles[i]))->is_finished) { + continue; + } + // Loop until IO for io_handles[i] is completed. + while (true) { + // io_uring_wait_cqe. + struct io_uring_cqe* cqe = nullptr; + ssize_t ret = io_uring_wait_cqe(iu, &cqe); + if (ret) { + // abort as it shouldn't be in indeterminate state and there is no + // good way currently to handle this error. + abort(); + } + + // Step 3: Populate the request. 
+ assert(cqe != nullptr); + Posix_IOHandle* posix_handle = + static_cast<Posix_IOHandle*>(io_uring_cqe_get_data(cqe)); + assert(posix_handle->iu == iu); + if (posix_handle->iu != iu) { + return IOStatus::IOError(""); + } + // Reset cqe data to catch any stray reuse of it + static_cast<struct io_uring_cqe*>(cqe)->user_data = 0xd5d5d5d5d5d5d5d5; + + FSReadRequest req; + req.scratch = posix_handle->scratch; + req.offset = posix_handle->offset; + req.len = posix_handle->len; + + size_t finished_len = 0; + size_t bytes_read = 0; + bool read_again = false; + UpdateResult(cqe, "", req.len, posix_handle->iov.iov_len, + true /*async_read*/, posix_handle->use_direct_io, + posix_handle->alignment, finished_len, &req, bytes_read, + read_again); + posix_handle->is_finished = true; + io_uring_cqe_seen(iu, cqe); + posix_handle->cb(req, posix_handle->cb_arg); + + (void)finished_len; + (void)bytes_read; + (void)read_again; + + if (static_cast<Posix_IOHandle*>(io_handles[i]) == posix_handle) { + break; + } + } + } + return IOStatus::OK(); +#else + (void)io_handles; + return IOStatus::NotSupported("Poll"); +#endif + } + + virtual IOStatus AbortIO(std::vector<void*>& io_handles) override { +#if defined(ROCKSDB_IOURING_PRESENT) + // io_uring_queue_init. + struct io_uring* iu = nullptr; + if (thread_local_io_urings_) { + iu = static_cast<struct io_uring*>(thread_local_io_urings_->Get()); + } + + // Init failed, platform doesn't support io_uring. + // If Poll is not supported then it didn't submit any request and it should + // return OK. + if (iu == nullptr) { + return IOStatus::OK(); + } + + for (size_t i = 0; i < io_handles.size(); i++) { + Posix_IOHandle* posix_handle = + static_cast<Posix_IOHandle*>(io_handles[i]); + if (posix_handle->is_finished == true) { + continue; + } + assert(posix_handle->iu == iu); + if (posix_handle->iu != iu) { + return IOStatus::IOError(""); + } + + // Prepare the cancel request. 
+ struct io_uring_sqe* sqe; + sqe = io_uring_get_sqe(iu); + + // In order to cancel the request, sqe->addr of cancel request should + // match with the read request submitted which is posix_handle->iov. + io_uring_prep_cancel(sqe, &posix_handle->iov, 0); + // Sets sqe->user_data to posix_handle. + io_uring_sqe_set_data(sqe, posix_handle); + + // submit the request. + ssize_t ret = io_uring_submit(iu); + if (ret < 0) { + fprintf(stderr, "io_uring_submit error: %ld\n", long(ret)); + return IOStatus::IOError("io_uring_submit() requested but returned " + + std::to_string(ret)); + } + } + + // After submitting the requests, wait for the requests. + for (size_t i = 0; i < io_handles.size(); i++) { + if ((static_cast<Posix_IOHandle*>(io_handles[i]))->is_finished) { + continue; + } + + while (true) { + struct io_uring_cqe* cqe = nullptr; + ssize_t ret = io_uring_wait_cqe(iu, &cqe); + if (ret) { + // abort as it shouldn't be in indeterminate state and there is no + // good way currently to handle this error. + abort(); + } + assert(cqe != nullptr); + + // Returns cqe->user_data. + Posix_IOHandle* posix_handle = + static_cast<Posix_IOHandle*>(io_uring_cqe_get_data(cqe)); + assert(posix_handle->iu == iu); + if (posix_handle->iu != iu) { + return IOStatus::IOError(""); + } + posix_handle->req_count++; + + // Reset cqe data to catch any stray reuse of it + static_cast<struct io_uring_cqe*>(cqe)->user_data = 0xd5d5d5d5d5d5d5d5; + io_uring_cqe_seen(iu, cqe); + + // - If the request is cancelled successfully, the original request is + // completed with -ECANCELED and the cancel request is completed with + // a result of 0. + // - If the request was already running, the original may or + // may not complete in error. The cancel request will complete with + // -EALREADY for that case. + // - And finally, if the request to cancel wasn't + // found, the cancel request is completed with -ENOENT. 
+ // + // Every handle has to wait for 2 requests completion: original one and + // the cancel request which is tracked by PosixHandle::req_count. + if (posix_handle->req_count == 2 && + static_cast<Posix_IOHandle*>(io_handles[i]) == posix_handle) { + posix_handle->is_finished = true; + FSReadRequest req; + req.status = IOStatus::Aborted(); + posix_handle->cb(req, posix_handle->cb_arg); + + break; + } + } + } + return IOStatus::OK(); +#else + // If Poll is not supported then it didn't submit any request and it should + // return OK. + (void)io_handles; + return IOStatus::OK(); +#endif + } + +#if defined(ROCKSDB_IOURING_PRESENT) + // io_uring instance + std::unique_ptr<ThreadLocalPtr> thread_local_io_urings_; +#endif + + size_t page_size_; + + // If true, allow non owner read access for db files. Otherwise, non-owner + // has no access to db files. + bool allow_non_owner_access_; + +#ifdef OS_LINUX + static LogicalBlockSizeCache logical_block_size_cache_; +#endif + static size_t GetLogicalBlockSize(const std::string& fname, int fd); + // In non-direct IO mode, this directly returns kDefaultPageSize. + // Otherwise call GetLogicalBlockSize. + static size_t GetLogicalBlockSizeForReadIfNeeded(const EnvOptions& options, + const std::string& fname, + int fd); + static size_t GetLogicalBlockSizeForWriteIfNeeded(const EnvOptions& options, + const std::string& fname, + int fd); +}; + +#ifdef OS_LINUX +LogicalBlockSizeCache PosixFileSystem::logical_block_size_cache_; +#endif + +size_t PosixFileSystem::GetLogicalBlockSize(const std::string& fname, int fd) { +#ifdef OS_LINUX + return logical_block_size_cache_.GetLogicalBlockSize(fname, fd); +#else + (void)fname; + return PosixHelper::GetLogicalBlockSizeOfFd(fd); +#endif +} + +size_t PosixFileSystem::GetLogicalBlockSizeForReadIfNeeded( + const EnvOptions& options, const std::string& fname, int fd) { + return options.use_direct_reads + ? 
PosixFileSystem::GetLogicalBlockSize(fname, fd) + : kDefaultPageSize; +} + +size_t PosixFileSystem::GetLogicalBlockSizeForWriteIfNeeded( + const EnvOptions& options, const std::string& fname, int fd) { + return options.use_direct_writes + ? PosixFileSystem::GetLogicalBlockSize(fname, fd) + : kDefaultPageSize; +} + +PosixFileSystem::PosixFileSystem() + : forceMmapOff_(false), + page_size_(getpagesize()), + allow_non_owner_access_(true) { +#if defined(ROCKSDB_IOURING_PRESENT) + // Test whether IOUring is supported, and if it does, create a managing + // object for thread local point so that in the future thread-local + // io_uring can be created. + struct io_uring* new_io_uring = CreateIOUring(); + if (new_io_uring != nullptr) { + thread_local_io_urings_.reset(new ThreadLocalPtr(DeleteIOUring)); + delete new_io_uring; + } +#endif +} + +} // namespace + +// +// Default Posix FileSystem +// +std::shared_ptr<FileSystem> FileSystem::Default() { + STATIC_AVOID_DESTRUCTION(std::shared_ptr<FileSystem>, instance) + (std::make_shared<PosixFileSystem>()); + return instance; +} + +#ifndef ROCKSDB_LITE +static FactoryFunc<FileSystem> posix_filesystem_reg = + ObjectLibrary::Default()->AddFactory<FileSystem>( + ObjectLibrary::PatternEntry("posix").AddSeparator("://", false), + [](const std::string& /* uri */, std::unique_ptr<FileSystem>* f, + std::string* /* errmsg */) { + f->reset(new PosixFileSystem()); + return f->get(); + }); +#endif + +} // namespace ROCKSDB_NAMESPACE + +#endif diff --git a/src/rocksdb/env/fs_readonly.h b/src/rocksdb/env/fs_readonly.h new file mode 100644 index 000000000..1bbe60784 --- /dev/null +++ b/src/rocksdb/env/fs_readonly.h @@ -0,0 +1,107 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + +#ifndef ROCKSDB_LITE + +#include "rocksdb/file_system.h" + +namespace ROCKSDB_NAMESPACE { + +// A FileSystem wrapper that only allows read-only operation. +// +// This class has not been fully analyzed for providing strong security +// guarantees. +class ReadOnlyFileSystem : public FileSystemWrapper { + static inline IOStatus FailReadOnly() { + IOStatus s = IOStatus::IOError("Attempted write to ReadOnlyFileSystem"); + assert(s.GetRetryable() == false); + return s; + } + + public: + explicit ReadOnlyFileSystem(const std::shared_ptr<FileSystem>& base) + : FileSystemWrapper(base) {} + + static const char* kClassName() { return "ReadOnlyFileSystem"; } + const char* Name() const override { return kClassName(); } + + IOStatus NewWritableFile(const std::string& /*fname*/, + const FileOptions& /*options*/, + std::unique_ptr<FSWritableFile>* /*result*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus ReuseWritableFile(const std::string& /*fname*/, + const std::string& /*old_fname*/, + const FileOptions& /*options*/, + std::unique_ptr<FSWritableFile>* /*result*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus NewRandomRWFile(const std::string& /*fname*/, + const FileOptions& /*options*/, + std::unique_ptr<FSRandomRWFile>* /*result*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus NewDirectory(const std::string& /*dir*/, + const IOOptions& /*options*/, + std::unique_ptr<FSDirectory>* /*result*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus DeleteFile(const std::string& /*fname*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus CreateDir(const std::string& /*dirname*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus CreateDirIfMissing(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) 
override { + // Allow if dir already exists + bool is_dir = false; + IOStatus s = IsDirectory(dirname, options, &is_dir, dbg); + if (s.ok() && is_dir) { + return s; + } else { + return FailReadOnly(); + } + } + IOStatus DeleteDir(const std::string& /*dirname*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus RenameFile(const std::string& /*src*/, const std::string& /*dest*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus LinkFile(const std::string& /*src*/, const std::string& /*dest*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus LockFile(const std::string& /*fname*/, const IOOptions& /*options*/, + FileLock** /*lock*/, IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus NewLogger(const std::string& /*fname*/, const IOOptions& /*options*/, + std::shared_ptr<Logger>* /*result*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/env/fs_remap.cc b/src/rocksdb/env/fs_remap.cc new file mode 100644 index 000000000..fd9241181 --- /dev/null +++ b/src/rocksdb/env/fs_remap.cc @@ -0,0 +1,343 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#ifndef ROCKSDB_LITE + +#include "env/fs_remap.h" + +namespace ROCKSDB_NAMESPACE { + +RemapFileSystem::RemapFileSystem(const std::shared_ptr<FileSystem>& base) + : FileSystemWrapper(base) {} + +std::pair<IOStatus, std::string> RemapFileSystem::EncodePathWithNewBasename( + const std::string& path) { + // No difference by default + return EncodePath(path); +} + +Status RemapFileSystem::RegisterDbPaths(const std::vector<std::string>& paths) { + std::vector<std::string> encoded_paths; + encoded_paths.reserve(paths.size()); + for (auto& path : paths) { + auto status_and_enc_path = EncodePathWithNewBasename(path); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + encoded_paths.emplace_back(status_and_enc_path.second); + } + return FileSystemWrapper::RegisterDbPaths(encoded_paths); +} + +Status RemapFileSystem::UnregisterDbPaths( + const std::vector<std::string>& paths) { + std::vector<std::string> encoded_paths; + encoded_paths.reserve(paths.size()); + for (auto& path : paths) { + auto status_and_enc_path = EncodePathWithNewBasename(path); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + encoded_paths.emplace_back(status_and_enc_path.second); + } + return FileSystemWrapper::UnregisterDbPaths(encoded_paths); +} + +IOStatus RemapFileSystem::NewSequentialFile( + const std::string& fname, const FileOptions& options, + std::unique_ptr<FSSequentialFile>* result, IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::NewSequentialFile(status_and_enc_path.second, + options, result, dbg); +} + +IOStatus RemapFileSystem::NewRandomAccessFile( + const std::string& fname, const FileOptions& options, + std::unique_ptr<FSRandomAccessFile>* result, IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(fname); + if (!status_and_enc_path.first.ok()) { + 
return status_and_enc_path.first; + } + return FileSystemWrapper::NewRandomAccessFile(status_and_enc_path.second, + options, result, dbg); +} + +IOStatus RemapFileSystem::NewWritableFile( + const std::string& fname, const FileOptions& options, + std::unique_ptr<FSWritableFile>* result, IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::NewWritableFile(status_and_enc_path.second, options, + result, dbg); +} + +IOStatus RemapFileSystem::ReuseWritableFile( + const std::string& fname, const std::string& old_fname, + const FileOptions& options, std::unique_ptr<FSWritableFile>* result, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + auto status_and_old_enc_path = EncodePath(old_fname); + if (!status_and_old_enc_path.first.ok()) { + return status_and_old_enc_path.first; + } + return FileSystemWrapper::ReuseWritableFile(status_and_old_enc_path.second, + status_and_old_enc_path.second, + options, result, dbg); +} + +IOStatus RemapFileSystem::NewRandomRWFile( + const std::string& fname, const FileOptions& options, + std::unique_ptr<FSRandomRWFile>* result, IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::NewRandomRWFile(status_and_enc_path.second, options, + result, dbg); +} + +IOStatus RemapFileSystem::NewDirectory(const std::string& dir, + const IOOptions& options, + std::unique_ptr<FSDirectory>* result, + IODebugContext* dbg) { + // A hassle to remap DirFsyncOptions::renamed_new_name + class RemapFSDirectory : public FSDirectoryWrapper { + public: + RemapFSDirectory(RemapFileSystem* fs, std::unique_ptr<FSDirectory>&& t) + : FSDirectoryWrapper(std::move(t)), fs_(fs) 
{} + IOStatus FsyncWithDirOptions( + const IOOptions& options, IODebugContext* dbg, + const DirFsyncOptions& dir_fsync_options) override { + if (dir_fsync_options.renamed_new_name.empty()) { + return FSDirectoryWrapper::FsyncWithDirOptions(options, dbg, + dir_fsync_options); + } else { + auto status_and_enc_path = + fs_->EncodePath(dir_fsync_options.renamed_new_name); + if (status_and_enc_path.first.ok()) { + DirFsyncOptions mapped_options = dir_fsync_options; + mapped_options.renamed_new_name = status_and_enc_path.second; + return FSDirectoryWrapper::FsyncWithDirOptions(options, dbg, + mapped_options); + } else { + return status_and_enc_path.first; + } + } + } + + private: + RemapFileSystem* const fs_; + }; + + auto status_and_enc_path = EncodePathWithNewBasename(dir); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + IOStatus ios = FileSystemWrapper::NewDirectory(status_and_enc_path.second, + options, result, dbg); + if (ios.ok()) { + *result = std::make_unique<RemapFSDirectory>(this, std::move(*result)); + } + return ios; +} + +IOStatus RemapFileSystem::FileExists(const std::string& fname, + const IOOptions& options, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::FileExists(status_and_enc_path.second, options, + dbg); +} + +IOStatus RemapFileSystem::GetChildren(const std::string& dir, + const IOOptions& options, + std::vector<std::string>* result, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePath(dir); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::GetChildren(status_and_enc_path.second, options, + result, dbg); +} + +IOStatus RemapFileSystem::GetChildrenFileAttributes( + const std::string& dir, const IOOptions& options, + std::vector<FileAttributes>* result, IODebugContext* dbg) { + auto 
status_and_enc_path = EncodePath(dir); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::GetChildrenFileAttributes( + status_and_enc_path.second, options, result, dbg); +} + +IOStatus RemapFileSystem::DeleteFile(const std::string& fname, + const IOOptions& options, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePath(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::DeleteFile(status_and_enc_path.second, options, + dbg); +} + +IOStatus RemapFileSystem::CreateDir(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(dirname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::CreateDir(status_and_enc_path.second, options, dbg); +} + +IOStatus RemapFileSystem::CreateDirIfMissing(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(dirname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::CreateDirIfMissing(status_and_enc_path.second, + options, dbg); +} + +IOStatus RemapFileSystem::DeleteDir(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePath(dirname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::DeleteDir(status_and_enc_path.second, options, dbg); +} + +IOStatus RemapFileSystem::GetFileSize(const std::string& fname, + const IOOptions& options, + uint64_t* file_size, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePath(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::GetFileSize(status_and_enc_path.second, options, + file_size, dbg); +} + 
+IOStatus RemapFileSystem::GetFileModificationTime(const std::string& fname, + const IOOptions& options, + uint64_t* file_mtime, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePath(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::GetFileModificationTime(status_and_enc_path.second, + options, file_mtime, dbg); +} + +IOStatus RemapFileSystem::IsDirectory(const std::string& path, + const IOOptions& options, bool* is_dir, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePath(path); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::IsDirectory(status_and_enc_path.second, options, + is_dir, dbg); +} + +IOStatus RemapFileSystem::RenameFile(const std::string& src, + const std::string& dest, + const IOOptions& options, + IODebugContext* dbg) { + auto status_and_src_enc_path = EncodePath(src); + if (!status_and_src_enc_path.first.ok()) { + if (status_and_src_enc_path.first.IsNotFound()) { + const IOStatus& s = status_and_src_enc_path.first; + status_and_src_enc_path.first = IOStatus::PathNotFound(s.ToString()); + } + return status_and_src_enc_path.first; + } + auto status_and_dest_enc_path = EncodePathWithNewBasename(dest); + if (!status_and_dest_enc_path.first.ok()) { + return status_and_dest_enc_path.first; + } + return FileSystemWrapper::RenameFile(status_and_src_enc_path.second, + status_and_dest_enc_path.second, options, + dbg); +} + +IOStatus RemapFileSystem::LinkFile(const std::string& src, + const std::string& dest, + const IOOptions& options, + IODebugContext* dbg) { + auto status_and_src_enc_path = EncodePath(src); + if (!status_and_src_enc_path.first.ok()) { + return status_and_src_enc_path.first; + } + auto status_and_dest_enc_path = EncodePathWithNewBasename(dest); + if (!status_and_dest_enc_path.first.ok()) { + return status_and_dest_enc_path.first; + } + return 
FileSystemWrapper::LinkFile(status_and_src_enc_path.second, + status_and_dest_enc_path.second, options, + dbg); +} + +IOStatus RemapFileSystem::LockFile(const std::string& fname, + const IOOptions& options, FileLock** lock, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + // FileLock subclasses may store path (e.g., PosixFileLock stores it). We + // can skip stripping the chroot directory from this path because callers + // shouldn't use it. + return FileSystemWrapper::LockFile(status_and_enc_path.second, options, lock, + dbg); +} + +IOStatus RemapFileSystem::NewLogger(const std::string& fname, + const IOOptions& options, + std::shared_ptr<Logger>* result, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::NewLogger(status_and_enc_path.second, options, + result, dbg); +} + +IOStatus RemapFileSystem::GetAbsolutePath(const std::string& db_path, + const IOOptions& options, + std::string* output_path, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(db_path); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::GetAbsolutePath(status_and_enc_path.second, options, + output_path, dbg); +} + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/env/fs_remap.h b/src/rocksdb/env/fs_remap.h new file mode 100644 index 000000000..1f6e061fd --- /dev/null +++ b/src/rocksdb/env/fs_remap.h @@ -0,0 +1,139 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + +#ifndef ROCKSDB_LITE + +#include <utility> + +#include "rocksdb/file_system.h" + +namespace ROCKSDB_NAMESPACE { + +// An abstract FileSystem wrapper that creates a view of an existing +// FileSystem by remapping names in some way. +// +// This class has not been fully analyzed for providing strong security +// guarantees. +class RemapFileSystem : public FileSystemWrapper { + public: + explicit RemapFileSystem(const std::shared_ptr<FileSystem>& base); + + protected: + // Returns status and mapped-to path in the wrapped filesystem. + // If it returns non-OK status, the returned path should not be used. + virtual std::pair<IOStatus, std::string> EncodePath( + const std::string& path) = 0; + + // Similar to EncodePath() except used in cases in which it is OK for + // no file or directory on 'path' to already exist, such as if the + // operation would create one. However, the parent of 'path' is expected + // to exist for the operation to succeed. + // Default implementation: call EncodePath + virtual std::pair<IOStatus, std::string> EncodePathWithNewBasename( + const std::string& path); + + public: + // Left abstract: + // const char* Name() const override { ... 
} + static const char* kClassName() { return "RemapFileSystem"; } + bool IsInstanceOf(const std::string& id) const override { + if (id == kClassName()) { + return true; + } else { + return FileSystemWrapper::IsInstanceOf(id); + } + } + + Status RegisterDbPaths(const std::vector<std::string>& paths) override; + + Status UnregisterDbPaths(const std::vector<std::string>& paths) override; + + IOStatus NewSequentialFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr<FSSequentialFile>* result, + IODebugContext* dbg) override; + + IOStatus NewRandomAccessFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr<FSRandomAccessFile>* result, + IODebugContext* dbg) override; + + IOStatus NewWritableFile(const std::string& fname, const FileOptions& options, + std::unique_ptr<FSWritableFile>* result, + IODebugContext* dbg) override; + + IOStatus ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + const FileOptions& options, + std::unique_ptr<FSWritableFile>* result, + IODebugContext* dbg) override; + + IOStatus NewRandomRWFile(const std::string& fname, const FileOptions& options, + std::unique_ptr<FSRandomRWFile>* result, + IODebugContext* dbg) override; + + IOStatus NewDirectory(const std::string& dir, const IOOptions& options, + std::unique_ptr<FSDirectory>* result, + IODebugContext* dbg) override; + + IOStatus FileExists(const std::string& fname, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus GetChildren(const std::string& dir, const IOOptions& options, + std::vector<std::string>* result, + IODebugContext* dbg) override; + + IOStatus GetChildrenFileAttributes(const std::string& dir, + const IOOptions& options, + std::vector<FileAttributes>* result, + IODebugContext* dbg) override; + + IOStatus DeleteFile(const std::string& fname, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus CreateDir(const std::string& dirname, const IOOptions& options, + 
IODebugContext* dbg) override; + + IOStatus CreateDirIfMissing(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus DeleteDir(const std::string& dirname, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus GetFileSize(const std::string& fname, const IOOptions& options, + uint64_t* file_size, IODebugContext* dbg) override; + + IOStatus GetFileModificationTime(const std::string& fname, + const IOOptions& options, + uint64_t* file_mtime, + IODebugContext* dbg) override; + + IOStatus IsDirectory(const std::string& path, const IOOptions& options, + bool* is_dir, IODebugContext* dbg) override; + + IOStatus RenameFile(const std::string& src, const std::string& dest, + const IOOptions& options, IODebugContext* dbg) override; + + IOStatus LinkFile(const std::string& src, const std::string& dest, + const IOOptions& options, IODebugContext* dbg) override; + + IOStatus LockFile(const std::string& fname, const IOOptions& options, + FileLock** lock, IODebugContext* dbg) override; + + IOStatus NewLogger(const std::string& fname, const IOOptions& options, + std::shared_ptr<Logger>* result, + IODebugContext* dbg) override; + + IOStatus GetAbsolutePath(const std::string& db_path, const IOOptions& options, + std::string* output_path, + IODebugContext* dbg) override; +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/env/io_posix.cc b/src/rocksdb/env/io_posix.cc new file mode 100644 index 000000000..0ec0e9c83 --- /dev/null +++ b/src/rocksdb/env/io_posix.cc @@ -0,0 +1,1733 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifdef ROCKSDB_LIB_IO_POSIX +#include "env/io_posix.h" + +#include <errno.h> +#include <fcntl.h> + +#include <algorithm> +#if defined(OS_LINUX) +#include <linux/fs.h> +#ifndef FALLOC_FL_KEEP_SIZE +#include <linux/falloc.h> +#endif +#endif +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <sys/types.h> +#ifdef OS_LINUX +#include <sys/statfs.h> +#include <sys/sysmacros.h> +#endif +#include "monitoring/iostats_context_imp.h" +#include "port/port.h" +#include "port/stack_trace.h" +#include "rocksdb/slice.h" +#include "test_util/sync_point.h" +#include "util/autovector.h" +#include "util/coding.h" +#include "util/string_util.h" + +#if defined(OS_LINUX) && !defined(F_SET_RW_HINT) +#define F_LINUX_SPECIFIC_BASE 1024 +#define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12) +#endif + +namespace ROCKSDB_NAMESPACE { + +std::string IOErrorMsg(const std::string& context, + const std::string& file_name) { + if (file_name.empty()) { + return context; + } + return context + ": " + file_name; +} + +// file_name can be left empty if it is not unkown. +IOStatus IOError(const std::string& context, const std::string& file_name, + int err_number) { + switch (err_number) { + case ENOSPC: { + IOStatus s = IOStatus::NoSpace(IOErrorMsg(context, file_name), + errnoStr(err_number).c_str()); + s.SetRetryable(true); + return s; + } + case ESTALE: + return IOStatus::IOError(IOStatus::kStaleFile); + case ENOENT: + return IOStatus::PathNotFound(IOErrorMsg(context, file_name), + errnoStr(err_number).c_str()); + default: + return IOStatus::IOError(IOErrorMsg(context, file_name), + errnoStr(err_number).c_str()); + } +} + +// A wrapper for fadvise, if the platform doesn't support fadvise, +// it will simply return 0. 
+int Fadvise(int fd, off_t offset, size_t len, int advice) { +#ifdef OS_LINUX + return posix_fadvise(fd, offset, len, advice); +#else + (void)fd; + (void)offset; + (void)len; + (void)advice; + return 0; // simply do nothing. +#endif +} + +// A wrapper for fadvise, if the platform doesn't support fadvise, +// it will simply return 0. +int Madvise(void* addr, size_t len, int advice) { +#ifdef OS_LINUX + return posix_madvise(addr, len, advice); +#else + (void)addr; + (void)len; + (void)advice; + return 0; // simply do nothing. +#endif +} + +namespace { + +// On MacOS (and probably *BSD), the posix write and pwrite calls do not support +// buffers larger than 2^31-1 bytes. These two wrappers fix this issue by +// cutting the buffer in 1GB chunks. We use this chunk size to be sure to keep +// the writes aligned. + +bool PosixWrite(int fd, const char* buf, size_t nbyte) { + const size_t kLimit1Gb = 1UL << 30; + + const char* src = buf; + size_t left = nbyte; + + while (left != 0) { + size_t bytes_to_write = std::min(left, kLimit1Gb); + + ssize_t done = write(fd, src, bytes_to_write); + if (done < 0) { + if (errno == EINTR) { + continue; + } + return false; + } + left -= done; + src += done; + } + return true; +} + +bool PosixPositionedWrite(int fd, const char* buf, size_t nbyte, off_t offset) { + const size_t kLimit1Gb = 1UL << 30; + + const char* src = buf; + size_t left = nbyte; + + while (left != 0) { + size_t bytes_to_write = std::min(left, kLimit1Gb); + + ssize_t done = pwrite(fd, src, bytes_to_write, offset); + if (done < 0) { + if (errno == EINTR) { + continue; + } + return false; + } + left -= done; + offset += done; + src += done; + } + + return true; +} + +#ifdef ROCKSDB_RANGESYNC_PRESENT + +#if !defined(ZFS_SUPER_MAGIC) +// The magic number for ZFS was not exposed until recently. It should be fixed +// forever so we can just copy the magic number here. 
+#define ZFS_SUPER_MAGIC 0x2fc12fc1 +#endif + +bool IsSyncFileRangeSupported(int fd) { + // This function tracks and checks for cases where we know `sync_file_range` + // definitely will not work properly despite passing the compile-time check + // (`ROCKSDB_RANGESYNC_PRESENT`). If we are unsure, or if any of the checks + // fail in unexpected ways, we allow `sync_file_range` to be used. This way + // should minimize risk of impacting existing use cases. + struct statfs buf; + int ret = fstatfs(fd, &buf); + assert(ret == 0); + if (ret == 0 && buf.f_type == ZFS_SUPER_MAGIC) { + // Testing on ZFS showed the writeback did not happen asynchronously when + // `sync_file_range` was called, even though it returned success. Avoid it + // and use `fdatasync` instead to preserve the contract of `bytes_per_sync`, + // even though this'll incur extra I/O for metadata. + return false; + } + + ret = sync_file_range(fd, 0 /* offset */, 0 /* nbytes */, 0 /* flags */); + assert(!(ret == -1 && errno != ENOSYS)); + if (ret == -1 && errno == ENOSYS) { + // `sync_file_range` is not implemented on all platforms even if + // compile-time checks pass and a supported filesystem is in-use. For + // example, using ext4 on WSL (Windows Subsystem for Linux), + // `sync_file_range()` returns `ENOSYS` + // ("Function not implemented"). + return false; + } + // None of the known cases matched, so allow `sync_file_range` use. 
+ return true; +} + +#undef ZFS_SUPER_MAGIC + +#endif // ROCKSDB_RANGESYNC_PRESENT + +} // anonymous namespace + +/* + * PosixSequentialFile + */ +PosixSequentialFile::PosixSequentialFile(const std::string& fname, FILE* file, + int fd, size_t logical_block_size, + const EnvOptions& options) + : filename_(fname), + file_(file), + fd_(fd), + use_direct_io_(options.use_direct_reads), + logical_sector_size_(logical_block_size) { + assert(!options.use_direct_reads || !options.use_mmap_reads); +} + +PosixSequentialFile::~PosixSequentialFile() { + if (!use_direct_io()) { + assert(file_); + fclose(file_); + } else { + assert(fd_); + close(fd_); + } +} + +IOStatus PosixSequentialFile::Read(size_t n, const IOOptions& /*opts*/, + Slice* result, char* scratch, + IODebugContext* /*dbg*/) { + assert(result != nullptr && !use_direct_io()); + IOStatus s; + size_t r = 0; + do { + clearerr(file_); + r = fread_unlocked(scratch, 1, n, file_); + } while (r == 0 && ferror(file_) && errno == EINTR); + *result = Slice(scratch, r); + if (r < n) { + if (feof(file_)) { + // We leave status as ok if we hit the end of the file + // We also clear the error so that the reads can continue + // if a new data is written to the file + clearerr(file_); + } else { + // A partial read with an error: return a non-ok status + s = IOError("While reading file sequentially", filename_, errno); + } + } + return s; +} + +IOStatus PosixSequentialFile::PositionedRead(uint64_t offset, size_t n, + const IOOptions& /*opts*/, + Slice* result, char* scratch, + IODebugContext* /*dbg*/) { + assert(use_direct_io()); + assert(IsSectorAligned(offset, GetRequiredBufferAlignment())); + assert(IsSectorAligned(n, GetRequiredBufferAlignment())); + assert(IsSectorAligned(scratch, GetRequiredBufferAlignment())); + + IOStatus s; + ssize_t r = -1; + size_t left = n; + char* ptr = scratch; + while (left > 0) { + r = pread(fd_, ptr, left, static_cast<off_t>(offset)); + if (r <= 0) { + if (r == -1 && errno == EINTR) { + continue; + 
} + break; + } + ptr += r; + offset += r; + left -= r; + if (!IsSectorAligned(r, GetRequiredBufferAlignment())) { + // Bytes reads don't fill sectors. Should only happen at the end + // of the file. + break; + } + } + if (r < 0) { + // An error: return a non-ok status + s = IOError("While pread " + std::to_string(n) + " bytes from offset " + + std::to_string(offset), + filename_, errno); + } + *result = Slice(scratch, (r < 0) ? 0 : n - left); + return s; +} + +IOStatus PosixSequentialFile::Skip(uint64_t n) { + if (fseek(file_, static_cast<long int>(n), SEEK_CUR)) { + return IOError("While fseek to skip " + std::to_string(n) + " bytes", + filename_, errno); + } + return IOStatus::OK(); +} + +IOStatus PosixSequentialFile::InvalidateCache(size_t offset, size_t length) { +#ifndef OS_LINUX + (void)offset; + (void)length; + return IOStatus::OK(); +#else + if (!use_direct_io()) { + // free OS pages + int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED); + if (ret != 0) { + return IOError("While fadvise NotNeeded offset " + + std::to_string(offset) + " len " + + std::to_string(length), + filename_, errno); + } + } + return IOStatus::OK(); +#endif +} + +/* + * PosixRandomAccessFile + */ +#if defined(OS_LINUX) +size_t PosixHelper::GetUniqueIdFromFile(int fd, char* id, size_t max_size) { + if (max_size < kMaxVarint64Length * 3) { + return 0; + } + + struct stat buf; + int result = fstat(fd, &buf); + if (result == -1) { + return 0; + } + + long version = 0; + result = ioctl(fd, FS_IOC_GETVERSION, &version); + TEST_SYNC_POINT_CALLBACK("GetUniqueIdFromFile:FS_IOC_GETVERSION", &result); + if (result == -1) { + return 0; + } + uint64_t uversion = (uint64_t)version; + + char* rid = id; + rid = EncodeVarint64(rid, buf.st_dev); + rid = EncodeVarint64(rid, buf.st_ino); + rid = EncodeVarint64(rid, uversion); + assert(rid >= id); + return static_cast<size_t>(rid - id); +} +#endif + +#if defined(OS_MACOSX) || defined(OS_AIX) +size_t PosixHelper::GetUniqueIdFromFile(int fd, char* 
id, size_t max_size) { + if (max_size < kMaxVarint64Length * 3) { + return 0; + } + + struct stat buf; + int result = fstat(fd, &buf); + if (result == -1) { + return 0; + } + + char* rid = id; + rid = EncodeVarint64(rid, buf.st_dev); + rid = EncodeVarint64(rid, buf.st_ino); + rid = EncodeVarint64(rid, buf.st_gen); + assert(rid >= id); + return static_cast<size_t>(rid - id); +} +#endif + +#ifdef OS_LINUX +std::string RemoveTrailingSlash(const std::string& path) { + std::string p = path; + if (p.size() > 1 && p.back() == '/') { + p.pop_back(); + } + return p; +} + +Status LogicalBlockSizeCache::RefAndCacheLogicalBlockSize( + const std::vector<std::string>& directories) { + std::vector<std::string> dirs; + dirs.reserve(directories.size()); + for (auto& d : directories) { + dirs.emplace_back(RemoveTrailingSlash(d)); + } + + std::map<std::string, size_t> dir_sizes; + { + ReadLock lock(&cache_mutex_); + for (const auto& dir : dirs) { + if (cache_.find(dir) == cache_.end()) { + dir_sizes.emplace(dir, 0); + } + } + } + + Status s; + for (auto& dir_size : dir_sizes) { + s = get_logical_block_size_of_directory_(dir_size.first, &dir_size.second); + if (!s.ok()) { + return s; + } + } + + WriteLock lock(&cache_mutex_); + for (const auto& dir : dirs) { + auto& v = cache_[dir]; + v.ref++; + auto dir_size = dir_sizes.find(dir); + if (dir_size != dir_sizes.end()) { + v.size = dir_size->second; + } + } + return s; +} + +void LogicalBlockSizeCache::UnrefAndTryRemoveCachedLogicalBlockSize( + const std::vector<std::string>& directories) { + std::vector<std::string> dirs; + dirs.reserve(directories.size()); + for (auto& dir : directories) { + dirs.emplace_back(RemoveTrailingSlash(dir)); + } + + WriteLock lock(&cache_mutex_); + for (const auto& dir : dirs) { + auto it = cache_.find(dir); + if (it != cache_.end() && !(--(it->second.ref))) { + cache_.erase(it); + } + } +} + +size_t LogicalBlockSizeCache::GetLogicalBlockSize(const std::string& fname, + int fd) { + std::string dir = 
fname.substr(0, fname.find_last_of("/")); + if (dir.empty()) { + dir = "/"; + } + { + ReadLock lock(&cache_mutex_); + auto it = cache_.find(dir); + if (it != cache_.end()) { + return it->second.size; + } + } + return get_logical_block_size_of_fd_(fd); +} +#endif + +Status PosixHelper::GetLogicalBlockSizeOfDirectory(const std::string& directory, + size_t* size) { + int fd = open(directory.c_str(), O_DIRECTORY | O_RDONLY); + if (fd == -1) { + close(fd); + return Status::IOError("Cannot open directory " + directory); + } + *size = PosixHelper::GetLogicalBlockSizeOfFd(fd); + close(fd); + return Status::OK(); +} + +size_t PosixHelper::GetLogicalBlockSizeOfFd(int fd) { +#ifdef OS_LINUX + struct stat buf; + int result = fstat(fd, &buf); + if (result == -1) { + return kDefaultPageSize; + } + if (major(buf.st_dev) == 0) { + // Unnamed devices (e.g. non-device mounts), reserved as null device number. + // These don't have an entry in /sys/dev/block/. Return a sensible default. + return kDefaultPageSize; + } + + // Reading queue/logical_block_size does not require special permissions. + const int kBufferSize = 100; + char path[kBufferSize]; + char real_path[PATH_MAX + 1]; + snprintf(path, kBufferSize, "/sys/dev/block/%u:%u", major(buf.st_dev), + minor(buf.st_dev)); + if (realpath(path, real_path) == nullptr) { + return kDefaultPageSize; + } + std::string device_dir(real_path); + if (!device_dir.empty() && device_dir.back() == '/') { + device_dir.pop_back(); + } + // NOTE: sda3 and nvme0n1p1 do not have a `queue/` subdir, only the parent sda + // and nvme0n1 have it. + // $ ls -al '/sys/dev/block/8:3' + // lrwxrwxrwx. 
1 root root 0 Jun 26 01:38 /sys/dev/block/8:3 -> + // ../../block/sda/sda3 + // $ ls -al '/sys/dev/block/259:4' + // lrwxrwxrwx 1 root root 0 Jan 31 16:04 /sys/dev/block/259:4 -> + // ../../devices/pci0000:17/0000:17:00.0/0000:18:00.0/nvme/nvme0/nvme0n1/nvme0n1p1 + size_t parent_end = device_dir.rfind('/', device_dir.length() - 1); + if (parent_end == std::string::npos) { + return kDefaultPageSize; + } + size_t parent_begin = device_dir.rfind('/', parent_end - 1); + if (parent_begin == std::string::npos) { + return kDefaultPageSize; + } + std::string parent = + device_dir.substr(parent_begin + 1, parent_end - parent_begin - 1); + std::string child = device_dir.substr(parent_end + 1, std::string::npos); + if (parent != "block" && + (child.compare(0, 4, "nvme") || child.find('p') != std::string::npos)) { + device_dir = device_dir.substr(0, parent_end); + } + std::string fname = device_dir + "/queue/logical_block_size"; + FILE* fp; + size_t size = 0; + fp = fopen(fname.c_str(), "r"); + if (fp != nullptr) { + char* line = nullptr; + size_t len = 0; + if (getline(&line, &len, fp) != -1) { + sscanf(line, "%zu", &size); + } + free(line); + fclose(fp); + } + if (size != 0 && (size & (size - 1)) == 0) { + return size; + } +#endif + (void)fd; + return kDefaultPageSize; +} + +/* + * PosixRandomAccessFile + * + * pread() based random-access + */ +PosixRandomAccessFile::PosixRandomAccessFile( + const std::string& fname, int fd, size_t logical_block_size, + const EnvOptions& options +#if defined(ROCKSDB_IOURING_PRESENT) + , + ThreadLocalPtr* thread_local_io_urings +#endif + ) + : filename_(fname), + fd_(fd), + use_direct_io_(options.use_direct_reads), + logical_sector_size_(logical_block_size) +#if defined(ROCKSDB_IOURING_PRESENT) + , + thread_local_io_urings_(thread_local_io_urings) +#endif +{ + assert(!options.use_direct_reads || !options.use_mmap_reads); + assert(!options.use_mmap_reads); +} + +PosixRandomAccessFile::~PosixRandomAccessFile() { close(fd_); } + +IOStatus 
PosixRandomAccessFile::Read(uint64_t offset, size_t n, + const IOOptions& /*opts*/, Slice* result, + char* scratch, + IODebugContext* /*dbg*/) const { + if (use_direct_io()) { + assert(IsSectorAligned(offset, GetRequiredBufferAlignment())); + assert(IsSectorAligned(n, GetRequiredBufferAlignment())); + assert(IsSectorAligned(scratch, GetRequiredBufferAlignment())); + } + IOStatus s; + ssize_t r = -1; + size_t left = n; + char* ptr = scratch; + while (left > 0) { + r = pread(fd_, ptr, left, static_cast<off_t>(offset)); + if (r <= 0) { + if (r == -1 && errno == EINTR) { + continue; + } + break; + } + ptr += r; + offset += r; + left -= r; + if (use_direct_io() && + r % static_cast<ssize_t>(GetRequiredBufferAlignment()) != 0) { + // Bytes reads don't fill sectors. Should only happen at the end + // of the file. + break; + } + } + if (r < 0) { + // An error: return a non-ok status + s = IOError("While pread offset " + std::to_string(offset) + " len " + + std::to_string(n), + filename_, errno); + } + *result = Slice(scratch, (r < 0) ? 0 : n - left); + return s; +} + +IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs, size_t num_reqs, + const IOOptions& options, + IODebugContext* dbg) { + if (use_direct_io()) { + for (size_t i = 0; i < num_reqs; i++) { + assert(IsSectorAligned(reqs[i].offset, GetRequiredBufferAlignment())); + assert(IsSectorAligned(reqs[i].len, GetRequiredBufferAlignment())); + assert(IsSectorAligned(reqs[i].scratch, GetRequiredBufferAlignment())); + } + } + +#if defined(ROCKSDB_IOURING_PRESENT) + struct io_uring* iu = nullptr; + if (thread_local_io_urings_) { + iu = static_cast<struct io_uring*>(thread_local_io_urings_->Get()); + if (iu == nullptr) { + iu = CreateIOUring(); + if (iu != nullptr) { + thread_local_io_urings_->Reset(iu); + } + } + } + + // Init failed, platform doesn't support io_uring. 
Fall back to + // serialized reads + if (iu == nullptr) { + return FSRandomAccessFile::MultiRead(reqs, num_reqs, options, dbg); + } + + IOStatus ios = IOStatus::OK(); + + struct WrappedReadRequest { + FSReadRequest* req; + struct iovec iov; + size_t finished_len; + explicit WrappedReadRequest(FSReadRequest* r) : req(r), finished_len(0) {} + }; + + autovector<WrappedReadRequest, 32> req_wraps; + autovector<WrappedReadRequest*, 4> incomplete_rq_list; + std::unordered_set<WrappedReadRequest*> wrap_cache; + + for (size_t i = 0; i < num_reqs; i++) { + req_wraps.emplace_back(&reqs[i]); + } + + size_t reqs_off = 0; + while (num_reqs > reqs_off || !incomplete_rq_list.empty()) { + size_t this_reqs = (num_reqs - reqs_off) + incomplete_rq_list.size(); + + // If requests exceed depth, split it into batches + if (this_reqs > kIoUringDepth) this_reqs = kIoUringDepth; + + assert(incomplete_rq_list.size() <= this_reqs); + for (size_t i = 0; i < this_reqs; i++) { + WrappedReadRequest* rep_to_submit; + if (i < incomplete_rq_list.size()) { + rep_to_submit = incomplete_rq_list[i]; + } else { + rep_to_submit = &req_wraps[reqs_off++]; + } + assert(rep_to_submit->req->len > rep_to_submit->finished_len); + rep_to_submit->iov.iov_base = + rep_to_submit->req->scratch + rep_to_submit->finished_len; + rep_to_submit->iov.iov_len = + rep_to_submit->req->len - rep_to_submit->finished_len; + + struct io_uring_sqe* sqe; + sqe = io_uring_get_sqe(iu); + io_uring_prep_readv( + sqe, fd_, &rep_to_submit->iov, 1, + rep_to_submit->req->offset + rep_to_submit->finished_len); + io_uring_sqe_set_data(sqe, rep_to_submit); + wrap_cache.emplace(rep_to_submit); + } + incomplete_rq_list.clear(); + + ssize_t ret = + io_uring_submit_and_wait(iu, static_cast<unsigned int>(this_reqs)); + TEST_SYNC_POINT_CALLBACK( + "PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return1", + &ret); + TEST_SYNC_POINT_CALLBACK( + "PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return2", + iu); + + if 
(static_cast<size_t>(ret) != this_reqs) { + fprintf(stderr, "ret = %ld this_reqs: %ld\n", (long)ret, (long)this_reqs); + // If error happens and we submitted fewer than expected, it is an + // exception case and we don't retry here. We should still consume + // what is is submitted in the ring. + for (ssize_t i = 0; i < ret; i++) { + struct io_uring_cqe* cqe = nullptr; + io_uring_wait_cqe(iu, &cqe); + if (cqe != nullptr) { + io_uring_cqe_seen(iu, cqe); + } + } + return IOStatus::IOError("io_uring_submit_and_wait() requested " + + std::to_string(this_reqs) + " but returned " + + std::to_string(ret)); + } + + for (size_t i = 0; i < this_reqs; i++) { + struct io_uring_cqe* cqe = nullptr; + WrappedReadRequest* req_wrap; + + // We could use the peek variant here, but this seems safer in terms + // of our initial wait not reaping all completions + ret = io_uring_wait_cqe(iu, &cqe); + TEST_SYNC_POINT_CALLBACK( + "PosixRandomAccessFile::MultiRead:io_uring_wait_cqe:return", &ret); + if (ret) { + ios = IOStatus::IOError("io_uring_wait_cqe() returns " + + std::to_string(ret)); + + if (cqe != nullptr) { + io_uring_cqe_seen(iu, cqe); + } + continue; + } + + req_wrap = static_cast<WrappedReadRequest*>(io_uring_cqe_get_data(cqe)); + // Reset cqe data to catch any stray reuse of it + static_cast<struct io_uring_cqe*>(cqe)->user_data = 0xd5d5d5d5d5d5d5d5; + // Check that we got a valid unique cqe data + auto wrap_check = wrap_cache.find(req_wrap); + if (wrap_check == wrap_cache.end()) { + fprintf(stderr, + "PosixRandomAccessFile::MultiRead: " + "Bad cqe data from IO uring - %p\n", + req_wrap); + port::PrintStack(); + ios = IOStatus::IOError("io_uring_cqe_get_data() returned " + + std::to_string((uint64_t)req_wrap)); + continue; + } + wrap_cache.erase(wrap_check); + + FSReadRequest* req = req_wrap->req; + size_t bytes_read = 0; + bool read_again = false; + UpdateResult(cqe, filename_, req->len, req_wrap->iov.iov_len, + false /*async_read*/, use_direct_io(), + 
GetRequiredBufferAlignment(), req_wrap->finished_len, req, + bytes_read, read_again); + int32_t res = cqe->res; + if (res >= 0) { + if (bytes_read == 0) { + if (read_again) { + Slice tmp_slice; + req->status = + Read(req->offset + req_wrap->finished_len, + req->len - req_wrap->finished_len, options, &tmp_slice, + req->scratch + req_wrap->finished_len, dbg); + req->result = + Slice(req->scratch, req_wrap->finished_len + tmp_slice.size()); + } + // else It means EOF so no need to do anything. + } else if (bytes_read < req_wrap->iov.iov_len) { + incomplete_rq_list.push_back(req_wrap); + } + } + io_uring_cqe_seen(iu, cqe); + } + wrap_cache.clear(); + } + return ios; +#else + return FSRandomAccessFile::MultiRead(reqs, num_reqs, options, dbg); +#endif +} + +IOStatus PosixRandomAccessFile::Prefetch(uint64_t offset, size_t n, + const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { + IOStatus s; + if (!use_direct_io()) { + ssize_t r = 0; +#ifdef OS_LINUX + r = readahead(fd_, offset, n); +#endif +#ifdef OS_MACOSX + radvisory advice; + advice.ra_offset = static_cast<off_t>(offset); + advice.ra_count = static_cast<int>(n); + r = fcntl(fd_, F_RDADVISE, &advice); +#endif + if (r == -1) { + s = IOError("While prefetching offset " + std::to_string(offset) + + " len " + std::to_string(n), + filename_, errno); + } + } + return s; +} + +#if defined(OS_LINUX) || defined(OS_MACOSX) || defined(OS_AIX) +size_t PosixRandomAccessFile::GetUniqueId(char* id, size_t max_size) const { + return PosixHelper::GetUniqueIdFromFile(fd_, id, max_size); +} +#endif + +void PosixRandomAccessFile::Hint(AccessPattern pattern) { + if (use_direct_io()) { + return; + } + switch (pattern) { + case kNormal: + Fadvise(fd_, 0, 0, POSIX_FADV_NORMAL); + break; + case kRandom: + Fadvise(fd_, 0, 0, POSIX_FADV_RANDOM); + break; + case kSequential: + Fadvise(fd_, 0, 0, POSIX_FADV_SEQUENTIAL); + break; + case kWillNeed: + Fadvise(fd_, 0, 0, POSIX_FADV_WILLNEED); + break; + case kWontNeed: + Fadvise(fd_, 0, 0, 
POSIX_FADV_DONTNEED); + break; + default: + assert(false); + break; + } +} + +IOStatus PosixRandomAccessFile::InvalidateCache(size_t offset, size_t length) { + if (use_direct_io()) { + return IOStatus::OK(); + } +#ifndef OS_LINUX + (void)offset; + (void)length; + return IOStatus::OK(); +#else + // free OS pages + int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED); + if (ret == 0) { + return IOStatus::OK(); + } + return IOError("While fadvise NotNeeded offset " + std::to_string(offset) + + " len " + std::to_string(length), + filename_, errno); +#endif +} + +IOStatus PosixRandomAccessFile::ReadAsync( + FSReadRequest& req, const IOOptions& /*opts*/, + std::function<void(const FSReadRequest&, void*)> cb, void* cb_arg, + void** io_handle, IOHandleDeleter* del_fn, IODebugContext* /*dbg*/) { + if (use_direct_io()) { + assert(IsSectorAligned(req.offset, GetRequiredBufferAlignment())); + assert(IsSectorAligned(req.len, GetRequiredBufferAlignment())); + assert(IsSectorAligned(req.scratch, GetRequiredBufferAlignment())); + } + +#if defined(ROCKSDB_IOURING_PRESENT) + // io_uring_queue_init. + struct io_uring* iu = nullptr; + if (thread_local_io_urings_) { + iu = static_cast<struct io_uring*>(thread_local_io_urings_->Get()); + if (iu == nullptr) { + iu = CreateIOUring(); + if (iu != nullptr) { + thread_local_io_urings_->Reset(iu); + } + } + } + + // Init failed, platform doesn't support io_uring. + if (iu == nullptr) { + return IOStatus::NotSupported("ReadAsync"); + } + + // Allocate io_handle. + IOHandleDeleter deletefn = [](void* args) -> void { + delete (static_cast<Posix_IOHandle*>(args)); + args = nullptr; + }; + + // Initialize Posix_IOHandle. 
+ Posix_IOHandle* posix_handle = + new Posix_IOHandle(iu, cb, cb_arg, req.offset, req.len, req.scratch, + use_direct_io(), GetRequiredBufferAlignment()); + posix_handle->iov.iov_base = req.scratch; + posix_handle->iov.iov_len = req.len; + + *io_handle = static_cast<void*>(posix_handle); + *del_fn = deletefn; + + // Step 3: io_uring_sqe_set_data + struct io_uring_sqe* sqe; + sqe = io_uring_get_sqe(iu); + + io_uring_prep_readv(sqe, fd_, /*sqe->addr=*/&posix_handle->iov, + /*sqe->len=*/1, /*sqe->offset=*/posix_handle->offset); + + // Sets sqe->user_data to posix_handle. + io_uring_sqe_set_data(sqe, posix_handle); + + // Step 4: io_uring_submit + ssize_t ret = io_uring_submit(iu); + if (ret < 0) { + fprintf(stderr, "io_uring_submit error: %ld\n", long(ret)); + return IOStatus::IOError("io_uring_submit() requested but returned " + + std::to_string(ret)); + } + return IOStatus::OK(); +#else + (void)req; + (void)cb; + (void)cb_arg; + (void)io_handle; + (void)del_fn; + return IOStatus::NotSupported("ReadAsync"); +#endif +} + +/* + * PosixMmapReadableFile + * + * mmap() based random-access + */ +// base[0,length-1] contains the mmapped contents of the file. 
+PosixMmapReadableFile::PosixMmapReadableFile(const int fd, + const std::string& fname, + void* base, size_t length, + const EnvOptions& options) + : fd_(fd), filename_(fname), mmapped_region_(base), length_(length) { +#ifdef NDEBUG + (void)options; +#endif + fd_ = fd_ + 0; // suppress the warning for used variables + assert(options.use_mmap_reads); + assert(!options.use_direct_reads); +} + +PosixMmapReadableFile::~PosixMmapReadableFile() { + int ret = munmap(mmapped_region_, length_); + if (ret != 0) { + fprintf(stdout, "failed to munmap %p length %" ROCKSDB_PRIszt " \n", + mmapped_region_, length_); + } + close(fd_); +} + +IOStatus PosixMmapReadableFile::Read(uint64_t offset, size_t n, + const IOOptions& /*opts*/, Slice* result, + char* /*scratch*/, + IODebugContext* /*dbg*/) const { + IOStatus s; + if (offset > length_) { + *result = Slice(); + return IOError("While mmap read offset " + std::to_string(offset) + + " larger than file length " + std::to_string(length_), + filename_, EINVAL); + } else if (offset + n > length_) { + n = static_cast<size_t>(length_ - offset); + } + *result = Slice(reinterpret_cast<char*>(mmapped_region_) + offset, n); + return s; +} + +void PosixMmapReadableFile::Hint(AccessPattern pattern) { + switch (pattern) { + case kNormal: + Madvise(mmapped_region_, length_, POSIX_MADV_NORMAL); + break; + case kRandom: + Madvise(mmapped_region_, length_, POSIX_MADV_RANDOM); + break; + case kSequential: + Madvise(mmapped_region_, length_, POSIX_MADV_SEQUENTIAL); + break; + case kWillNeed: + Madvise(mmapped_region_, length_, POSIX_MADV_WILLNEED); + break; + case kWontNeed: + Madvise(mmapped_region_, length_, POSIX_MADV_DONTNEED); + break; + default: + assert(false); + break; + } +} + +IOStatus PosixMmapReadableFile::InvalidateCache(size_t offset, size_t length) { +#ifndef OS_LINUX + (void)offset; + (void)length; + return IOStatus::OK(); +#else + // free OS pages + int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED); + if (ret == 0) { + 
return IOStatus::OK(); + } + return IOError("While fadvise not needed. Offset " + std::to_string(offset) + + " len" + std::to_string(length), + filename_, errno); +#endif +} + +/* + * PosixMmapFile + * + * We preallocate up to an extra megabyte and use memcpy to append new + * data to the file. This is safe since we either properly close the + * file before reading from it, or for log files, the reading code + * knows enough to skip zero suffixes. + */ +IOStatus PosixMmapFile::UnmapCurrentRegion() { + TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0"); + if (base_ != nullptr) { + int munmap_status = munmap(base_, limit_ - base_); + if (munmap_status != 0) { + return IOError("While munmap", filename_, munmap_status); + } + file_offset_ += limit_ - base_; + base_ = nullptr; + limit_ = nullptr; + last_sync_ = nullptr; + dst_ = nullptr; + + // Increase the amount we map the next time, but capped at 1MB + if (map_size_ < (1 << 20)) { + map_size_ *= 2; + } + } + return IOStatus::OK(); +} + +IOStatus PosixMmapFile::MapNewRegion() { +#ifdef ROCKSDB_FALLOCATE_PRESENT + assert(base_ == nullptr); + TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0"); + // we can't fallocate with FALLOC_FL_KEEP_SIZE here + if (allow_fallocate_) { + IOSTATS_TIMER_GUARD(allocate_nanos); + int alloc_status = fallocate(fd_, 0, file_offset_, map_size_); + if (alloc_status != 0) { + // fallback to posix_fallocate + alloc_status = posix_fallocate(fd_, file_offset_, map_size_); + } + if (alloc_status != 0) { + return IOStatus::IOError("Error allocating space to file : " + filename_ + + "Error : " + errnoStr(alloc_status).c_str()); + } + } + + TEST_KILL_RANDOM("PosixMmapFile::Append:1"); + void* ptr = mmap(nullptr, map_size_, PROT_READ | PROT_WRITE, MAP_SHARED, fd_, + file_offset_); + if (ptr == MAP_FAILED) { + return IOStatus::IOError("MMap failed on " + filename_); + } + TEST_KILL_RANDOM("PosixMmapFile::Append:2"); + + base_ = reinterpret_cast<char*>(ptr); + limit_ = base_ + map_size_; + 
dst_ = base_; + last_sync_ = base_; + return IOStatus::OK(); +#else + return IOStatus::NotSupported("This platform doesn't support fallocate()"); +#endif +} + +IOStatus PosixMmapFile::Msync() { + if (dst_ == last_sync_) { + return IOStatus::OK(); + } + // Find the beginnings of the pages that contain the first and last + // bytes to be synced. + size_t p1 = TruncateToPageBoundary(last_sync_ - base_); + size_t p2 = TruncateToPageBoundary(dst_ - base_ - 1); + last_sync_ = dst_; + TEST_KILL_RANDOM("PosixMmapFile::Msync:0"); + if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) { + return IOError("While msync", filename_, errno); + } + return IOStatus::OK(); +} + +PosixMmapFile::PosixMmapFile(const std::string& fname, int fd, size_t page_size, + const EnvOptions& options) + : filename_(fname), + fd_(fd), + page_size_(page_size), + map_size_(Roundup(65536, page_size)), + base_(nullptr), + limit_(nullptr), + dst_(nullptr), + last_sync_(nullptr), + file_offset_(0) { +#ifdef ROCKSDB_FALLOCATE_PRESENT + allow_fallocate_ = options.allow_fallocate; + fallocate_with_keep_size_ = options.fallocate_with_keep_size; +#else + (void)options; +#endif + assert((page_size & (page_size - 1)) == 0); + assert(options.use_mmap_writes); + assert(!options.use_direct_writes); +} + +PosixMmapFile::~PosixMmapFile() { + if (fd_ >= 0) { + IOStatus s = PosixMmapFile::Close(IOOptions(), nullptr); + s.PermitUncheckedError(); + } +} + +IOStatus PosixMmapFile::Append(const Slice& data, const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { + const char* src = data.data(); + size_t left = data.size(); + while (left > 0) { + assert(base_ <= dst_); + assert(dst_ <= limit_); + size_t avail = limit_ - dst_; + if (avail == 0) { + IOStatus s = UnmapCurrentRegion(); + if (!s.ok()) { + return s; + } + s = MapNewRegion(); + if (!s.ok()) { + return s; + } + TEST_KILL_RANDOM("PosixMmapFile::Append:0"); + } + + size_t n = (left <= avail) ? 
left : avail; + assert(dst_); + memcpy(dst_, src, n); + dst_ += n; + src += n; + left -= n; + } + return IOStatus::OK(); +} + +IOStatus PosixMmapFile::Close(const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { + IOStatus s; + size_t unused = limit_ - dst_; + + s = UnmapCurrentRegion(); + if (!s.ok()) { + s = IOError("While closing mmapped file", filename_, errno); + } else if (unused > 0) { + // Trim the extra space at the end of the file + if (ftruncate(fd_, file_offset_ - unused) < 0) { + s = IOError("While ftruncating mmaped file", filename_, errno); + } + } + + if (close(fd_) < 0) { + if (s.ok()) { + s = IOError("While closing mmapped file", filename_, errno); + } + } + + fd_ = -1; + base_ = nullptr; + limit_ = nullptr; + return s; +} + +IOStatus PosixMmapFile::Flush(const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { + return IOStatus::OK(); +} + +IOStatus PosixMmapFile::Sync(const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { +#ifdef HAVE_FULLFSYNC + if (::fcntl(fd_, F_FULLFSYNC) < 0) { + return IOError("while fcntl(F_FULLSYNC) mmapped file", filename_, errno); + } +#else // HAVE_FULLFSYNC + if (fdatasync(fd_) < 0) { + return IOError("While fdatasync mmapped file", filename_, errno); + } +#endif // HAVE_FULLFSYNC + + return Msync(); +} + +/** + * Flush data as well as metadata to stable storage. + */ +IOStatus PosixMmapFile::Fsync(const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { +#ifdef HAVE_FULLFSYNC + if (::fcntl(fd_, F_FULLFSYNC) < 0) { + return IOError("While fcntl(F_FULLSYNC) on mmaped file", filename_, errno); + } +#else // HAVE_FULLFSYNC + if (fsync(fd_) < 0) { + return IOError("While fsync mmaped file", filename_, errno); + } +#endif // HAVE_FULLFSYNC + + return Msync(); +} + +/** + * Get the size of valid data in the file. This will not match the + * size that is returned from the filesystem because we use mmap + * to extend file by map_size every time. 
+ */ +uint64_t PosixMmapFile::GetFileSize(const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { + size_t used = dst_ - base_; + return file_offset_ + used; +} + +IOStatus PosixMmapFile::InvalidateCache(size_t offset, size_t length) { +#ifndef OS_LINUX + (void)offset; + (void)length; + return IOStatus::OK(); +#else + // free OS pages + int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED); + if (ret == 0) { + return IOStatus::OK(); + } + return IOError("While fadvise NotNeeded mmapped file", filename_, errno); +#endif +} + +#ifdef ROCKSDB_FALLOCATE_PRESENT +IOStatus PosixMmapFile::Allocate(uint64_t offset, uint64_t len, + const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { + assert(offset <= static_cast<uint64_t>(std::numeric_limits<off_t>::max())); + assert(len <= static_cast<uint64_t>(std::numeric_limits<off_t>::max())); + TEST_KILL_RANDOM("PosixMmapFile::Allocate:0"); + int alloc_status = 0; + if (allow_fallocate_) { + alloc_status = + fallocate(fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, + static_cast<off_t>(offset), static_cast<off_t>(len)); + } + if (alloc_status == 0) { + return IOStatus::OK(); + } else { + return IOError("While fallocate offset " + std::to_string(offset) + + " len " + std::to_string(len), + filename_, errno); + } +} +#endif + +/* + * PosixWritableFile + * + * Use posix write to write data to a file. 
+ */ +PosixWritableFile::PosixWritableFile(const std::string& fname, int fd, + size_t logical_block_size, + const EnvOptions& options) + : FSWritableFile(options), + filename_(fname), + use_direct_io_(options.use_direct_writes), + fd_(fd), + filesize_(0), + logical_sector_size_(logical_block_size) { +#ifdef ROCKSDB_FALLOCATE_PRESENT + allow_fallocate_ = options.allow_fallocate; + fallocate_with_keep_size_ = options.fallocate_with_keep_size; +#endif +#ifdef ROCKSDB_RANGESYNC_PRESENT + sync_file_range_supported_ = IsSyncFileRangeSupported(fd_); +#endif // ROCKSDB_RANGESYNC_PRESENT + assert(!options.use_mmap_writes); +} + +PosixWritableFile::~PosixWritableFile() { + if (fd_ >= 0) { + IOStatus s = PosixWritableFile::Close(IOOptions(), nullptr); + s.PermitUncheckedError(); + } +} + +IOStatus PosixWritableFile::Append(const Slice& data, const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { + if (use_direct_io()) { + assert(IsSectorAligned(data.size(), GetRequiredBufferAlignment())); + assert(IsSectorAligned(data.data(), GetRequiredBufferAlignment())); + } + const char* src = data.data(); + size_t nbytes = data.size(); + + if (!PosixWrite(fd_, src, nbytes)) { + return IOError("While appending to file", filename_, errno); + } + + filesize_ += nbytes; + return IOStatus::OK(); +} + +IOStatus PosixWritableFile::PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { + if (use_direct_io()) { + assert(IsSectorAligned(offset, GetRequiredBufferAlignment())); + assert(IsSectorAligned(data.size(), GetRequiredBufferAlignment())); + assert(IsSectorAligned(data.data(), GetRequiredBufferAlignment())); + } + assert(offset <= static_cast<uint64_t>(std::numeric_limits<off_t>::max())); + const char* src = data.data(); + size_t nbytes = data.size(); + if (!PosixPositionedWrite(fd_, src, nbytes, static_cast<off_t>(offset))) { + return IOError("While pwrite to file at offset " + std::to_string(offset), + filename_, errno); + } + 
filesize_ = offset + nbytes; + return IOStatus::OK(); +} + +IOStatus PosixWritableFile::Truncate(uint64_t size, const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { + IOStatus s; + int r = ftruncate(fd_, size); + if (r < 0) { + s = IOError("While ftruncate file to size " + std::to_string(size), + filename_, errno); + } else { + filesize_ = size; + } + return s; +} + +IOStatus PosixWritableFile::Close(const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { + IOStatus s; + + size_t block_size; + size_t last_allocated_block; + GetPreallocationStatus(&block_size, &last_allocated_block); + TEST_SYNC_POINT_CALLBACK("PosixWritableFile::Close", &last_allocated_block); + if (last_allocated_block > 0) { + // trim the extra space preallocated at the end of the file + // NOTE(ljin): we probably don't want to surface failure as an IOError, + // but it will be nice to log these errors. + int dummy __attribute__((__unused__)); + dummy = ftruncate(fd_, filesize_); +#if defined(ROCKSDB_FALLOCATE_PRESENT) && defined(FALLOC_FL_PUNCH_HOLE) + // in some file systems, ftruncate only trims trailing space if the + // new file size is smaller than the current size. Calling fallocate + // with FALLOC_FL_PUNCH_HOLE flag to explicitly release these unused + // blocks. FALLOC_FL_PUNCH_HOLE is supported on at least the following + // filesystems: + // XFS (since Linux 2.6.38) + // ext4 (since Linux 3.0) + // Btrfs (since Linux 3.7) + // tmpfs (since Linux 3.5) + // We ignore error since failure of this operation does not affect + // correctness. + struct stat file_stats; + int result = fstat(fd_, &file_stats); + // After ftruncate, we check whether ftruncate has the correct behavior. 
+ // If not, we should hack it with FALLOC_FL_PUNCH_HOLE + if (result == 0 && + (file_stats.st_size + file_stats.st_blksize - 1) / + file_stats.st_blksize != + file_stats.st_blocks / (file_stats.st_blksize / 512)) { + IOSTATS_TIMER_GUARD(allocate_nanos); + if (allow_fallocate_) { + fallocate(fd_, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, filesize_, + block_size * last_allocated_block - filesize_); + } + } +#endif + } + + if (close(fd_) < 0) { + s = IOError("While closing file after writing", filename_, errno); + } + fd_ = -1; + return s; +} + +// write out the cached data to the OS cache +IOStatus PosixWritableFile::Flush(const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { + return IOStatus::OK(); +} + +IOStatus PosixWritableFile::Sync(const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { +#ifdef HAVE_FULLFSYNC + if (::fcntl(fd_, F_FULLFSYNC) < 0) { + return IOError("while fcntl(F_FULLFSYNC)", filename_, errno); + } +#else // HAVE_FULLFSYNC + if (fdatasync(fd_) < 0) { + return IOError("While fdatasync", filename_, errno); + } +#endif // HAVE_FULLFSYNC + return IOStatus::OK(); +} + +IOStatus PosixWritableFile::Fsync(const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { +#ifdef HAVE_FULLFSYNC + if (::fcntl(fd_, F_FULLFSYNC) < 0) { + return IOError("while fcntl(F_FULLFSYNC)", filename_, errno); + } +#else // HAVE_FULLFSYNC + if (fsync(fd_) < 0) { + return IOError("While fsync", filename_, errno); + } +#endif // HAVE_FULLFSYNC + return IOStatus::OK(); +} + +bool PosixWritableFile::IsSyncThreadSafe() const { return true; } + +uint64_t PosixWritableFile::GetFileSize(const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { + return filesize_; +} + +void PosixWritableFile::SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) { +#ifdef OS_LINUX +// Suppress Valgrind "Unimplemented functionality" error. 
+#ifndef ROCKSDB_VALGRIND_RUN + if (hint == write_hint_) { + return; + } + if (fcntl(fd_, F_SET_RW_HINT, &hint) == 0) { + write_hint_ = hint; + } +#else + (void)hint; +#endif // ROCKSDB_VALGRIND_RUN +#else + (void)hint; +#endif // OS_LINUX +} + +IOStatus PosixWritableFile::InvalidateCache(size_t offset, size_t length) { + if (use_direct_io()) { + return IOStatus::OK(); + } +#ifndef OS_LINUX + (void)offset; + (void)length; + return IOStatus::OK(); +#else + // free OS pages + int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED); + if (ret == 0) { + return IOStatus::OK(); + } + return IOError("While fadvise NotNeeded", filename_, errno); +#endif +} + +#ifdef ROCKSDB_FALLOCATE_PRESENT +IOStatus PosixWritableFile::Allocate(uint64_t offset, uint64_t len, + const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { + assert(offset <= static_cast<uint64_t>(std::numeric_limits<off_t>::max())); + assert(len <= static_cast<uint64_t>(std::numeric_limits<off_t>::max())); + TEST_KILL_RANDOM("PosixWritableFile::Allocate:0"); + IOSTATS_TIMER_GUARD(allocate_nanos); + int alloc_status = 0; + if (allow_fallocate_) { + alloc_status = + fallocate(fd_, fallocate_with_keep_size_ ? 
FALLOC_FL_KEEP_SIZE : 0, + static_cast<off_t>(offset), static_cast<off_t>(len)); + } + if (alloc_status == 0) { + return IOStatus::OK(); + } else { + return IOError("While fallocate offset " + std::to_string(offset) + + " len " + std::to_string(len), + filename_, errno); + } +} +#endif + +IOStatus PosixWritableFile::RangeSync(uint64_t offset, uint64_t nbytes, + const IOOptions& opts, + IODebugContext* dbg) { +#ifdef ROCKSDB_RANGESYNC_PRESENT + assert(offset <= static_cast<uint64_t>(std::numeric_limits<off_t>::max())); + assert(nbytes <= static_cast<uint64_t>(std::numeric_limits<off_t>::max())); + if (sync_file_range_supported_) { + int ret; + if (strict_bytes_per_sync_) { + // Specifying `SYNC_FILE_RANGE_WAIT_BEFORE` together with an offset/length + // that spans all bytes written so far tells `sync_file_range` to wait for + // any outstanding writeback requests to finish before issuing a new one. + ret = + sync_file_range(fd_, 0, static_cast<off_t>(offset + nbytes), + SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE); + } else { + ret = sync_file_range(fd_, static_cast<off_t>(offset), + static_cast<off_t>(nbytes), SYNC_FILE_RANGE_WRITE); + } + if (ret != 0) { + return IOError("While sync_file_range returned " + std::to_string(ret), + filename_, errno); + } + return IOStatus::OK(); + } +#endif // ROCKSDB_RANGESYNC_PRESENT + return FSWritableFile::RangeSync(offset, nbytes, opts, dbg); +} + +#ifdef OS_LINUX +size_t PosixWritableFile::GetUniqueId(char* id, size_t max_size) const { + return PosixHelper::GetUniqueIdFromFile(fd_, id, max_size); +} +#endif + +/* + * PosixRandomRWFile + */ + +PosixRandomRWFile::PosixRandomRWFile(const std::string& fname, int fd, + const EnvOptions& /*options*/) + : filename_(fname), fd_(fd) {} + +PosixRandomRWFile::~PosixRandomRWFile() { + if (fd_ >= 0) { + IOStatus s = Close(IOOptions(), nullptr); + s.PermitUncheckedError(); + } +} + +IOStatus PosixRandomRWFile::Write(uint64_t offset, const Slice& data, + const IOOptions& /*opts*/, + 
IODebugContext* /*dbg*/) { + const char* src = data.data(); + size_t nbytes = data.size(); + if (!PosixPositionedWrite(fd_, src, nbytes, static_cast<off_t>(offset))) { + return IOError("While write random read/write file at offset " + + std::to_string(offset), + filename_, errno); + } + + return IOStatus::OK(); +} + +IOStatus PosixRandomRWFile::Read(uint64_t offset, size_t n, + const IOOptions& /*opts*/, Slice* result, + char* scratch, IODebugContext* /*dbg*/) const { + size_t left = n; + char* ptr = scratch; + while (left > 0) { + ssize_t done = pread(fd_, ptr, left, offset); + if (done < 0) { + // error while reading from file + if (errno == EINTR) { + // read was interrupted, try again. + continue; + } + return IOError("While reading random read/write file offset " + + std::to_string(offset) + " len " + std::to_string(n), + filename_, errno); + } else if (done == 0) { + // Nothing more to read + break; + } + + // Read `done` bytes + ptr += done; + offset += done; + left -= done; + } + + *result = Slice(scratch, n - left); + return IOStatus::OK(); +} + +IOStatus PosixRandomRWFile::Flush(const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { + return IOStatus::OK(); +} + +IOStatus PosixRandomRWFile::Sync(const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { +#ifdef HAVE_FULLFSYNC + if (::fcntl(fd_, F_FULLFSYNC) < 0) { + return IOError("while fcntl(F_FULLFSYNC) random rw file", filename_, errno); + } +#else // HAVE_FULLFSYNC + if (fdatasync(fd_) < 0) { + return IOError("While fdatasync random read/write file", filename_, errno); + } +#endif // HAVE_FULLFSYNC + return IOStatus::OK(); +} + +IOStatus PosixRandomRWFile::Fsync(const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { +#ifdef HAVE_FULLFSYNC + if (::fcntl(fd_, F_FULLFSYNC) < 0) { + return IOError("While fcntl(F_FULLSYNC) random rw file", filename_, errno); + } +#else // HAVE_FULLFSYNC + if (fsync(fd_) < 0) { + return IOError("While fsync random read/write file", filename_, errno); + } +#endif // 
HAVE_FULLFSYNC + return IOStatus::OK(); +} + +IOStatus PosixRandomRWFile::Close(const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { + if (close(fd_) < 0) { + return IOError("While close random read/write file", filename_, errno); + } + fd_ = -1; + return IOStatus::OK(); +} + +PosixMemoryMappedFileBuffer::~PosixMemoryMappedFileBuffer() { + // TODO should have error handling though not much we can do... + munmap(this->base_, length_); +} + +/* + * PosixDirectory + */ +#if !defined(BTRFS_SUPER_MAGIC) +// The magic number for BTRFS is fixed, if it's not defined, define it here +#define BTRFS_SUPER_MAGIC 0x9123683E +#endif +PosixDirectory::PosixDirectory(int fd, const std::string& directory_name) + : fd_(fd), directory_name_(directory_name) { + is_btrfs_ = false; +#ifdef OS_LINUX + struct statfs buf; + int ret = fstatfs(fd, &buf); + is_btrfs_ = (ret == 0 && buf.f_type == static_cast<decltype(buf.f_type)>( + BTRFS_SUPER_MAGIC)); +#endif +} + +PosixDirectory::~PosixDirectory() { + if (fd_ >= 0) { + IOStatus s = PosixDirectory::Close(IOOptions(), nullptr); + s.PermitUncheckedError(); + } +} + +IOStatus PosixDirectory::Fsync(const IOOptions& opts, IODebugContext* dbg) { + return FsyncWithDirOptions(opts, dbg, DirFsyncOptions()); +} + +// Users who want the file entries synced in Directory project must call a +// Fsync or FsyncWithDirOptions function before Close +IOStatus PosixDirectory::Close(const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { + IOStatus s = IOStatus::OK(); + if (close(fd_) < 0) { + s = IOError("While closing directory ", directory_name_, errno); + } else { + fd_ = -1; + } + return s; +} + +IOStatus PosixDirectory::FsyncWithDirOptions( + const IOOptions& /*opts*/, IODebugContext* /*dbg*/, + const DirFsyncOptions& dir_fsync_options) { + assert(fd_ >= 0); // Check use after close + IOStatus s = IOStatus::OK(); +#ifndef OS_AIX + if (is_btrfs_) { + // skip dir fsync for new file creation, which is not needed for btrfs + if (dir_fsync_options.reason == 
DirFsyncOptions::kNewFileSynced) { + return s; + } + // skip dir fsync for renaming file, only need to sync new file + if (dir_fsync_options.reason == DirFsyncOptions::kFileRenamed) { + std::string new_name = dir_fsync_options.renamed_new_name; + assert(!new_name.empty()); + int fd; + do { + IOSTATS_TIMER_GUARD(open_nanos); + fd = open(new_name.c_str(), O_RDONLY); + } while (fd < 0 && errno == EINTR); + if (fd < 0) { + s = IOError("While open renaming file", new_name, errno); + } else if (fsync(fd) < 0) { + s = IOError("While fsync renaming file", new_name, errno); + } + if (close(fd) < 0) { + s = IOError("While closing file after fsync", new_name, errno); + } + return s; + } + // fallback to dir-fsync for kDefault, kDirRenamed and kFileDeleted + } + + // skip fsync/fcntl when fd_ == -1 since this file descriptor has been closed + // in either the de-construction or the close function, data must have been + // fsync-ed before de-construction and close is called +#ifdef HAVE_FULLFSYNC + // btrfs is a Linux file system, while currently F_FULLFSYNC is available on + // Mac OS. + assert(!is_btrfs_); + if (fd_ != -1 && ::fcntl(fd_, F_FULLFSYNC) < 0) { + return IOError("while fcntl(F_FULLFSYNC)", "a directory", errno); + } +#else // HAVE_FULLFSYNC + if (fd_ != -1 && fsync(fd_) == -1) { + s = IOError("While fsync", "a directory", errno); + } +#endif // HAVE_FULLFSYNC +#endif // OS_AIX + return s; +} +} // namespace ROCKSDB_NAMESPACE +#endif diff --git a/src/rocksdb/env/io_posix.h b/src/rocksdb/env/io_posix.h new file mode 100644 index 000000000..f129668ea --- /dev/null +++ b/src/rocksdb/env/io_posix.h @@ -0,0 +1,523 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once +#include <errno.h> +#if defined(ROCKSDB_IOURING_PRESENT) +#include <liburing.h> +#include <sys/uio.h> +#endif +#include <unistd.h> + +#include <atomic> +#include <functional> +#include <map> +#include <string> + +#include "port/port.h" +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "rocksdb/io_status.h" +#include "test_util/sync_point.h" +#include "util/mutexlock.h" +#include "util/thread_local.h" + +// For non linux platform, the following macros are used only as place +// holder. +#if !(defined OS_LINUX) && !(defined CYGWIN) && !(defined OS_AIX) +#define POSIX_FADV_NORMAL 0 /* [MC1] no further special treatment */ +#define POSIX_FADV_RANDOM 1 /* [MC1] expect random page refs */ +#define POSIX_FADV_SEQUENTIAL 2 /* [MC1] expect sequential page refs */ +#define POSIX_FADV_WILLNEED 3 /* [MC1] will need these pages */ +#define POSIX_FADV_DONTNEED 4 /* [MC1] don't need these pages */ + +#define POSIX_MADV_NORMAL 0 /* [MC1] no further special treatment */ +#define POSIX_MADV_RANDOM 1 /* [MC1] expect random page refs */ +#define POSIX_MADV_SEQUENTIAL 2 /* [MC1] expect sequential page refs */ +#define POSIX_MADV_WILLNEED 3 /* [MC1] will need these pages */ +#define POSIX_MADV_DONTNEED 4 /* [MC1] don't need these pages */ +#endif + +namespace ROCKSDB_NAMESPACE { +std::string IOErrorMsg(const std::string& context, + const std::string& file_name); +// file_name can be left empty if it is unknown. 
+IOStatus IOError(const std::string& context, const std::string& file_name, + int err_number); + +class PosixHelper { + public: + static size_t GetUniqueIdFromFile(int fd, char* id, size_t max_size); + static size_t GetLogicalBlockSizeOfFd(int fd); + static Status GetLogicalBlockSizeOfDirectory(const std::string& directory, + size_t* size); +}; + +/* + * DirectIOHelper + */ +inline bool IsSectorAligned(const size_t off, size_t sector_size) { + assert((sector_size & (sector_size - 1)) == 0); + return (off & (sector_size - 1)) == 0; +} + +#ifndef NDEBUG +inline bool IsSectorAligned(const void* ptr, size_t sector_size) { + return uintptr_t(ptr) % sector_size == 0; +} +#endif + +#if defined(ROCKSDB_IOURING_PRESENT) +struct Posix_IOHandle { + Posix_IOHandle(struct io_uring* _iu, + std::function<void(const FSReadRequest&, void*)> _cb, + void* _cb_arg, uint64_t _offset, size_t _len, char* _scratch, + bool _use_direct_io, size_t _alignment) + : iu(_iu), + cb(_cb), + cb_arg(_cb_arg), + offset(_offset), + len(_len), + scratch(_scratch), + use_direct_io(_use_direct_io), + alignment(_alignment), + is_finished(false), + req_count(0) {} + + struct iovec iov; + struct io_uring* iu; + std::function<void(const FSReadRequest&, void*)> cb; + void* cb_arg; + uint64_t offset; + size_t len; + char* scratch; + bool use_direct_io; + size_t alignment; + bool is_finished; + // req_count is used by AbortIO API to keep track of number of requests. 
+ uint32_t req_count; +}; + +inline void UpdateResult(struct io_uring_cqe* cqe, const std::string& file_name, + size_t len, size_t iov_len, bool async_read, + bool use_direct_io, size_t alignment, + size_t& finished_len, FSReadRequest* req, + size_t& bytes_read, bool& read_again) { + read_again = false; + if (cqe->res < 0) { + req->result = Slice(req->scratch, 0); + req->status = IOError("Req failed", file_name, cqe->res); + } else { + bytes_read = static_cast<size_t>(cqe->res); + TEST_SYNC_POINT_CALLBACK("UpdateResults::io_uring_result", &bytes_read); + if (bytes_read == iov_len) { + req->result = Slice(req->scratch, req->len); + req->status = IOStatus::OK(); + } else if (bytes_read == 0) { + /// cqe->res == 0 can means EOF, or can mean partial results. See + // comment + // https://github.com/facebook/rocksdb/pull/6441#issuecomment-589843435 + // Fall back to pread in this case. + if (use_direct_io && !IsSectorAligned(finished_len, alignment)) { + // Bytes reads don't fill sectors. Should only happen at the end + // of the file. + req->result = Slice(req->scratch, finished_len); + req->status = IOStatus::OK(); + } else { + if (async_read) { + // No bytes read. It can means EOF. In case of partial results, it's + // caller responsibility to call read/readasync again. + req->result = Slice(req->scratch, 0); + req->status = IOStatus::OK(); + } else { + read_again = true; + } + } + } else if (bytes_read < iov_len) { + assert(bytes_read > 0); + if (async_read) { + req->result = Slice(req->scratch, bytes_read); + req->status = IOStatus::OK(); + } else { + assert(bytes_read + finished_len < len); + finished_len += bytes_read; + } + } else { + req->result = Slice(req->scratch, 0); + req->status = IOError("Req returned more bytes than requested", file_name, + cqe->res); + } + } +#ifdef NDEBUG + (void)len; +#endif +} +#endif + +#ifdef OS_LINUX +// Files under a specific directory have the same logical block size. 
+// This class caches the logical block size for the specified directories to +// save the CPU cost of computing the size. +// Safe for concurrent access from multiple threads without any external +// synchronization. +class LogicalBlockSizeCache { + public: + LogicalBlockSizeCache( + std::function<size_t(int)> get_logical_block_size_of_fd = + PosixHelper::GetLogicalBlockSizeOfFd, + std::function<Status(const std::string&, size_t*)> + get_logical_block_size_of_directory = + PosixHelper::GetLogicalBlockSizeOfDirectory) + : get_logical_block_size_of_fd_(get_logical_block_size_of_fd), + get_logical_block_size_of_directory_( + get_logical_block_size_of_directory) {} + + // Takes the following actions: + // 1. Increases reference count of the directories; + // 2. If the directory's logical block size is not cached, + // compute the buffer size and cache the result. + Status RefAndCacheLogicalBlockSize( + const std::vector<std::string>& directories); + + // Takes the following actions: + // 1. Decreases reference count of the directories; + // 2. If the reference count of a directory reaches 0, remove the directory + // from the cache. + void UnrefAndTryRemoveCachedLogicalBlockSize( + const std::vector<std::string>& directories); + + // Returns the logical block size for the file. + // + // If the file is under a cached directory, return the cached size. + // Otherwise, the size is computed. + size_t GetLogicalBlockSize(const std::string& fname, int fd); + + int GetRefCount(const std::string& dir) { + ReadLock lock(&cache_mutex_); + auto it = cache_.find(dir); + if (it == cache_.end()) { + return 0; + } + return it->second.ref; + } + + size_t Size() const { return cache_.size(); } + + bool Contains(const std::string& dir) { + ReadLock lock(&cache_mutex_); + return cache_.find(dir) != cache_.end(); + } + + private: + struct CacheValue { + CacheValue() : size(0), ref(0) {} + + // Logical block size of the directory. + size_t size; + // Reference count of the directory. 
+ int ref; + }; + + std::function<size_t(int)> get_logical_block_size_of_fd_; + std::function<Status(const std::string&, size_t*)> + get_logical_block_size_of_directory_; + + std::map<std::string, CacheValue> cache_; + port::RWMutex cache_mutex_; +}; +#endif + +class PosixSequentialFile : public FSSequentialFile { + private: + std::string filename_; + FILE* file_; + int fd_; + bool use_direct_io_; + size_t logical_sector_size_; + + public: + PosixSequentialFile(const std::string& fname, FILE* file, int fd, + size_t logical_block_size, const EnvOptions& options); + virtual ~PosixSequentialFile(); + + virtual IOStatus Read(size_t n, const IOOptions& opts, Slice* result, + char* scratch, IODebugContext* dbg) override; + virtual IOStatus PositionedRead(uint64_t offset, size_t n, + const IOOptions& opts, Slice* result, + char* scratch, IODebugContext* dbg) override; + virtual IOStatus Skip(uint64_t n) override; + virtual IOStatus InvalidateCache(size_t offset, size_t length) override; + virtual bool use_direct_io() const override { return use_direct_io_; } + virtual size_t GetRequiredBufferAlignment() const override { + return logical_sector_size_; + } +}; + +#if defined(ROCKSDB_IOURING_PRESENT) +// io_uring instance queue depth +const unsigned int kIoUringDepth = 256; + +inline void DeleteIOUring(void* p) { + struct io_uring* iu = static_cast<struct io_uring*>(p); + delete iu; +} + +inline struct io_uring* CreateIOUring() { + struct io_uring* new_io_uring = new struct io_uring; + int ret = io_uring_queue_init(kIoUringDepth, new_io_uring, 0); + if (ret) { + delete new_io_uring; + new_io_uring = nullptr; + } + return new_io_uring; +} +#endif // defined(ROCKSDB_IOURING_PRESENT) + +class PosixRandomAccessFile : public FSRandomAccessFile { + protected: + std::string filename_; + int fd_; + bool use_direct_io_; + size_t logical_sector_size_; +#if defined(ROCKSDB_IOURING_PRESENT) + ThreadLocalPtr* thread_local_io_urings_; +#endif + + public: + PosixRandomAccessFile(const 
std::string& fname, int fd, + size_t logical_block_size, const EnvOptions& options +#if defined(ROCKSDB_IOURING_PRESENT) + , + ThreadLocalPtr* thread_local_io_urings +#endif + ); + virtual ~PosixRandomAccessFile(); + + virtual IOStatus Read(uint64_t offset, size_t n, const IOOptions& opts, + Slice* result, char* scratch, + IODebugContext* dbg) const override; + + virtual IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs, + const IOOptions& options, + IODebugContext* dbg) override; + + virtual IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& opts, + IODebugContext* dbg) override; + +#if defined(OS_LINUX) || defined(OS_MACOSX) || defined(OS_AIX) + virtual size_t GetUniqueId(char* id, size_t max_size) const override; +#endif + virtual void Hint(AccessPattern pattern) override; + virtual IOStatus InvalidateCache(size_t offset, size_t length) override; + virtual bool use_direct_io() const override { return use_direct_io_; } + virtual size_t GetRequiredBufferAlignment() const override { + return logical_sector_size_; + } + // EXPERIMENTAL + virtual IOStatus ReadAsync( + FSReadRequest& req, const IOOptions& opts, + std::function<void(const FSReadRequest&, void*)> cb, void* cb_arg, + void** io_handle, IOHandleDeleter* del_fn, IODebugContext* dbg) override; +}; + +class PosixWritableFile : public FSWritableFile { + protected: + const std::string filename_; + const bool use_direct_io_; + int fd_; + uint64_t filesize_; + size_t logical_sector_size_; +#ifdef ROCKSDB_FALLOCATE_PRESENT + bool allow_fallocate_; + bool fallocate_with_keep_size_; +#endif +#ifdef ROCKSDB_RANGESYNC_PRESENT + // Even if the syscall is present, the filesystem may still not properly + // support it, so we need to do a dynamic check too. 
+ bool sync_file_range_supported_; +#endif // ROCKSDB_RANGESYNC_PRESENT + + public: + explicit PosixWritableFile(const std::string& fname, int fd, + size_t logical_block_size, + const EnvOptions& options); + virtual ~PosixWritableFile(); + + // Need to implement this so the file is truncated correctly + // with direct I/O + virtual IOStatus Truncate(uint64_t size, const IOOptions& opts, + IODebugContext* dbg) override; + virtual IOStatus Close(const IOOptions& opts, IODebugContext* dbg) override; + virtual IOStatus Append(const Slice& data, const IOOptions& opts, + IODebugContext* dbg) override; + virtual IOStatus Append(const Slice& data, const IOOptions& opts, + const DataVerificationInfo& /* verification_info */, + IODebugContext* dbg) override { + return Append(data, opts, dbg); + } + virtual IOStatus PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& opts, + IODebugContext* dbg) override; + virtual IOStatus PositionedAppend( + const Slice& data, uint64_t offset, const IOOptions& opts, + const DataVerificationInfo& /* verification_info */, + IODebugContext* dbg) override { + return PositionedAppend(data, offset, opts, dbg); + } + virtual IOStatus Flush(const IOOptions& opts, IODebugContext* dbg) override; + virtual IOStatus Sync(const IOOptions& opts, IODebugContext* dbg) override; + virtual IOStatus Fsync(const IOOptions& opts, IODebugContext* dbg) override; + virtual bool IsSyncThreadSafe() const override; + virtual bool use_direct_io() const override { return use_direct_io_; } + virtual void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override; + virtual uint64_t GetFileSize(const IOOptions& opts, + IODebugContext* dbg) override; + virtual IOStatus InvalidateCache(size_t offset, size_t length) override; + virtual size_t GetRequiredBufferAlignment() const override { + return logical_sector_size_; + } +#ifdef ROCKSDB_FALLOCATE_PRESENT + virtual IOStatus Allocate(uint64_t offset, uint64_t len, + const IOOptions& opts, + 
IODebugContext* dbg) override; +#endif + virtual IOStatus RangeSync(uint64_t offset, uint64_t nbytes, + const IOOptions& opts, + IODebugContext* dbg) override; +#ifdef OS_LINUX + virtual size_t GetUniqueId(char* id, size_t max_size) const override; +#endif +}; + +// mmap() based random-access +class PosixMmapReadableFile : public FSRandomAccessFile { + private: + int fd_; + std::string filename_; + void* mmapped_region_; + size_t length_; + + public: + PosixMmapReadableFile(const int fd, const std::string& fname, void* base, + size_t length, const EnvOptions& options); + virtual ~PosixMmapReadableFile(); + IOStatus Read(uint64_t offset, size_t n, const IOOptions& opts, Slice* result, + char* scratch, IODebugContext* dbg) const override; + void Hint(AccessPattern pattern) override; + IOStatus InvalidateCache(size_t offset, size_t length) override; +}; + +class PosixMmapFile : public FSWritableFile { + private: + std::string filename_; + int fd_; + size_t page_size_; + size_t map_size_; // How much extra memory to map at a time + char* base_; // The mapped region + char* limit_; // Limit of the mapped region + char* dst_; // Where to write next (in range [base_,limit_]) + char* last_sync_; // Where have we synced up to + uint64_t file_offset_; // Offset of base_ in file +#ifdef ROCKSDB_FALLOCATE_PRESENT + bool allow_fallocate_; // If false, fallocate calls are bypassed + bool fallocate_with_keep_size_; +#endif + + // Roundup x to a multiple of y + static size_t Roundup(size_t x, size_t y) { return ((x + y - 1) / y) * y; } + + size_t TruncateToPageBoundary(size_t s) { + s -= (s & (page_size_ - 1)); + assert((s % page_size_) == 0); + return s; + } + + IOStatus MapNewRegion(); + IOStatus UnmapCurrentRegion(); + IOStatus Msync(); + + public: + PosixMmapFile(const std::string& fname, int fd, size_t page_size, + const EnvOptions& options); + ~PosixMmapFile(); + + // Means Close() will properly take care of truncate + // and it does not need any additional information + 
virtual IOStatus Truncate(uint64_t /*size*/, const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + virtual IOStatus Close(const IOOptions& opts, IODebugContext* dbg) override; + virtual IOStatus Append(const Slice& data, const IOOptions& opts, + IODebugContext* dbg) override; + virtual IOStatus Append(const Slice& data, const IOOptions& opts, + const DataVerificationInfo& /* verification_info */, + IODebugContext* dbg) override { + return Append(data, opts, dbg); + } + virtual IOStatus Flush(const IOOptions& opts, IODebugContext* dbg) override; + virtual IOStatus Sync(const IOOptions& opts, IODebugContext* dbg) override; + virtual IOStatus Fsync(const IOOptions& opts, IODebugContext* dbg) override; + virtual uint64_t GetFileSize(const IOOptions& opts, + IODebugContext* dbg) override; + virtual IOStatus InvalidateCache(size_t offset, size_t length) override; +#ifdef ROCKSDB_FALLOCATE_PRESENT + virtual IOStatus Allocate(uint64_t offset, uint64_t len, + const IOOptions& opts, + IODebugContext* dbg) override; +#endif +}; + +class PosixRandomRWFile : public FSRandomRWFile { + public: + explicit PosixRandomRWFile(const std::string& fname, int fd, + const EnvOptions& options); + virtual ~PosixRandomRWFile(); + + virtual IOStatus Write(uint64_t offset, const Slice& data, + const IOOptions& opts, IODebugContext* dbg) override; + + virtual IOStatus Read(uint64_t offset, size_t n, const IOOptions& opts, + Slice* result, char* scratch, + IODebugContext* dbg) const override; + + virtual IOStatus Flush(const IOOptions& opts, IODebugContext* dbg) override; + virtual IOStatus Sync(const IOOptions& opts, IODebugContext* dbg) override; + virtual IOStatus Fsync(const IOOptions& opts, IODebugContext* dbg) override; + virtual IOStatus Close(const IOOptions& opts, IODebugContext* dbg) override; + + private: + const std::string filename_; + int fd_; +}; + +struct PosixMemoryMappedFileBuffer : public MemoryMappedFileBuffer { + 
PosixMemoryMappedFileBuffer(void* _base, size_t _length) + : MemoryMappedFileBuffer(_base, _length) {} + virtual ~PosixMemoryMappedFileBuffer(); +}; + +class PosixDirectory : public FSDirectory { + public: + explicit PosixDirectory(int fd, const std::string& directory_name); + ~PosixDirectory(); + virtual IOStatus Fsync(const IOOptions& opts, IODebugContext* dbg) override; + + virtual IOStatus Close(const IOOptions& opts, IODebugContext* dbg) override; + + virtual IOStatus FsyncWithDirOptions( + const IOOptions&, IODebugContext*, + const DirFsyncOptions& dir_fsync_options) override; + + private: + int fd_; + bool is_btrfs_; + const std::string directory_name_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/env/io_posix_test.cc b/src/rocksdb/env/io_posix_test.cc new file mode 100644 index 000000000..81ce50587 --- /dev/null +++ b/src/rocksdb/env/io_posix_test.cc @@ -0,0 +1,141 @@ +// Copyright (c) 2020-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "test_util/testharness.h" + +#ifdef ROCKSDB_LIB_IO_POSIX +#include "env/io_posix.h" + +namespace ROCKSDB_NAMESPACE { + +#ifdef OS_LINUX +class LogicalBlockSizeCacheTest : public testing::Test {}; + +// Tests the caching behavior. 
+TEST_F(LogicalBlockSizeCacheTest, Cache) { + int ncall = 0; + auto get_fd_block_size = [&](int fd) { + ncall++; + return fd; + }; + std::map<std::string, int> dir_fds{ + {"/", 0}, + {"/db", 1}, + {"/db1", 2}, + {"/db2", 3}, + }; + auto get_dir_block_size = [&](const std::string& dir, size_t* size) { + ncall++; + *size = dir_fds[dir]; + return Status::OK(); + }; + LogicalBlockSizeCache cache(get_fd_block_size, get_dir_block_size); + ASSERT_EQ(0, ncall); + ASSERT_EQ(0, cache.Size()); + + ASSERT_EQ(6, cache.GetLogicalBlockSize("/sst", 6)); + ASSERT_EQ(1, ncall); + ASSERT_EQ(7, cache.GetLogicalBlockSize("/db/sst1", 7)); + ASSERT_EQ(2, ncall); + ASSERT_EQ(8, cache.GetLogicalBlockSize("/db/sst2", 8)); + ASSERT_EQ(3, ncall); + + ASSERT_OK(cache.RefAndCacheLogicalBlockSize({"/", "/db1/", "/db2"})); + ASSERT_EQ(3, cache.Size()); + ASSERT_TRUE(cache.Contains("/")); + ASSERT_TRUE(cache.Contains("/db1")); + ASSERT_TRUE(cache.Contains("/db2")); + ASSERT_EQ(6, ncall); + // Block size for / is cached. + ASSERT_EQ(0, cache.GetLogicalBlockSize("/sst", 6)); + ASSERT_EQ(6, ncall); + // No cached size for /db. + ASSERT_EQ(7, cache.GetLogicalBlockSize("/db/sst1", 7)); + ASSERT_EQ(7, ncall); + ASSERT_EQ(8, cache.GetLogicalBlockSize("/db/sst2", 8)); + ASSERT_EQ(8, ncall); + // Block size for /db1 is cached. + ASSERT_EQ(2, cache.GetLogicalBlockSize("/db1/sst1", 4)); + ASSERT_EQ(8, ncall); + ASSERT_EQ(2, cache.GetLogicalBlockSize("/db1/sst2", 5)); + ASSERT_EQ(8, ncall); + // Block size for /db2 is cached. + ASSERT_EQ(3, cache.GetLogicalBlockSize("/db2/sst1", 6)); + ASSERT_EQ(8, ncall); + ASSERT_EQ(3, cache.GetLogicalBlockSize("/db2/sst2", 7)); + ASSERT_EQ(8, ncall); + + ASSERT_OK(cache.RefAndCacheLogicalBlockSize({"/db"})); + ASSERT_EQ(4, cache.Size()); + ASSERT_TRUE(cache.Contains("/")); + ASSERT_TRUE(cache.Contains("/db1")); + ASSERT_TRUE(cache.Contains("/db2")); + ASSERT_TRUE(cache.Contains("/db")); + + ASSERT_EQ(9, ncall); + // Block size for /db is cached. 
+ ASSERT_EQ(1, cache.GetLogicalBlockSize("/db/sst1", 7)); + ASSERT_EQ(9, ncall); + ASSERT_EQ(1, cache.GetLogicalBlockSize("/db/sst2", 8)); + ASSERT_EQ(9, ncall); +} + +// Tests the reference counting behavior. +TEST_F(LogicalBlockSizeCacheTest, Ref) { + int ncall = 0; + auto get_fd_block_size = [&](int fd) { + ncall++; + return fd; + }; + std::map<std::string, int> dir_fds{ + {"/db", 0}, + }; + auto get_dir_block_size = [&](const std::string& dir, size_t* size) { + ncall++; + *size = dir_fds[dir]; + return Status::OK(); + }; + LogicalBlockSizeCache cache(get_fd_block_size, get_dir_block_size); + + ASSERT_EQ(0, ncall); + + ASSERT_EQ(1, cache.GetLogicalBlockSize("/db/sst0", 1)); + ASSERT_EQ(1, ncall); + + ASSERT_OK(cache.RefAndCacheLogicalBlockSize({"/db"})); + ASSERT_EQ(2, ncall); + ASSERT_EQ(1, cache.GetRefCount("/db")); + // Block size for /db is cached. Ref count = 1. + ASSERT_EQ(0, cache.GetLogicalBlockSize("/db/sst1", 1)); + ASSERT_EQ(2, ncall); + + // Ref count = 2, but won't recompute the cached buffer size. + ASSERT_OK(cache.RefAndCacheLogicalBlockSize({"/db"})); + ASSERT_EQ(2, cache.GetRefCount("/db")); + ASSERT_EQ(2, ncall); + + // Ref count = 1. + cache.UnrefAndTryRemoveCachedLogicalBlockSize({"/db"}); + ASSERT_EQ(1, cache.GetRefCount("/db")); + // Block size for /db is still cached. + ASSERT_EQ(0, cache.GetLogicalBlockSize("/db/sst2", 1)); + ASSERT_EQ(2, ncall); + + // Ref count = 0 and cached buffer size for /db is removed. 
+ cache.UnrefAndTryRemoveCachedLogicalBlockSize({"/db"}); + ASSERT_EQ(0, cache.Size()); + ASSERT_EQ(1, cache.GetLogicalBlockSize("/db/sst0", 1)); + ASSERT_EQ(3, ncall); +} +#endif + +} // namespace ROCKSDB_NAMESPACE +#endif + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/env/mock_env.cc b/src/rocksdb/env/mock_env.cc new file mode 100644 index 000000000..bfa7dc2f4 --- /dev/null +++ b/src/rocksdb/env/mock_env.cc @@ -0,0 +1,1070 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "env/mock_env.h" + +#include <algorithm> +#include <chrono> + +#include "env/emulated_clock.h" +#include "file/filename.h" +#include "port/sys_time.h" +#include "rocksdb/file_system.h" +#include "rocksdb/utilities/options_type.h" +#include "test_util/sync_point.h" +#include "util/cast_util.h" +#include "util/hash.h" +#include "util/random.h" +#include "util/rate_limiter.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { +namespace { +int64_t MaybeCurrentTime(const std::shared_ptr<SystemClock>& clock) { + int64_t time = 1337346000; // arbitrary fallback default + clock->GetCurrentTime(&time).PermitUncheckedError(); + return time; +} + +static std::unordered_map<std::string, OptionTypeInfo> time_elapse_type_info = { +#ifndef ROCKSDB_LITE + {"time_elapse_only_sleep", + {0, OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kCompareNever, + [](const ConfigOptions& /*opts*/, const std::string& /*name*/, + const std::string& value, void* addr) { + auto clock = static_cast<EmulatedSystemClock*>(addr); + clock->SetTimeElapseOnlySleep(ParseBoolean("", value)); + return Status::OK(); + }, + [](const ConfigOptions& /*opts*/, const std::string& /*name*/, + const void* addr, std::string* value) { + const auto clock = static_cast<const EmulatedSystemClock*>(addr); + *value = clock->IsTimeElapseOnlySleep() ? 
"true" : "false"; + return Status::OK(); + }, + nullptr}}, +#endif // ROCKSDB_LITE +}; +static std::unordered_map<std::string, OptionTypeInfo> mock_sleep_type_info = { +#ifndef ROCKSDB_LITE + {"mock_sleep", + {0, OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kCompareNever, + [](const ConfigOptions& /*opts*/, const std::string& /*name*/, + const std::string& value, void* addr) { + auto clock = static_cast<EmulatedSystemClock*>(addr); + clock->SetMockSleep(ParseBoolean("", value)); + return Status::OK(); + }, + [](const ConfigOptions& /*opts*/, const std::string& /*name*/, + const void* addr, std::string* value) { + const auto clock = static_cast<const EmulatedSystemClock*>(addr); + *value = clock->IsMockSleepEnabled() ? "true" : "false"; + return Status::OK(); + }, + nullptr}}, +#endif // ROCKSDB_LITE +}; +} // namespace + +EmulatedSystemClock::EmulatedSystemClock( + const std::shared_ptr<SystemClock>& base, bool time_elapse_only_sleep) + : SystemClockWrapper(base), + maybe_starting_time_(MaybeCurrentTime(base)), + time_elapse_only_sleep_(time_elapse_only_sleep), + no_slowdown_(time_elapse_only_sleep) { + RegisterOptions("", this, &time_elapse_type_info); + RegisterOptions("", this, &mock_sleep_type_info); +} + +class MemFile { + public: + explicit MemFile(SystemClock* clock, const std::string& fn, + bool _is_lock_file = false) + : clock_(clock), + fn_(fn), + refs_(0), + is_lock_file_(_is_lock_file), + locked_(false), + size_(0), + modified_time_(Now()), + rnd_(Lower32of64(GetSliceNPHash64(fn))), + fsynced_bytes_(0) {} + // No copying allowed. 
+ MemFile(const MemFile&) = delete; + void operator=(const MemFile&) = delete; + + void Ref() { + MutexLock lock(&mutex_); + ++refs_; + } + + bool is_lock_file() const { return is_lock_file_; } + + bool Lock() { + assert(is_lock_file_); + MutexLock lock(&mutex_); + if (locked_) { + return false; + } else { + locked_ = true; + return true; + } + } + + void Unlock() { + assert(is_lock_file_); + MutexLock lock(&mutex_); + locked_ = false; + } + + void Unref() { + bool do_delete = false; + { + MutexLock lock(&mutex_); + --refs_; + assert(refs_ >= 0); + if (refs_ <= 0) { + do_delete = true; + } + } + + if (do_delete) { + delete this; + } + } + + uint64_t Size() const { return size_; } + + void Truncate(size_t size, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + MutexLock lock(&mutex_); + if (size < size_) { + data_.resize(size); + size_ = size; + } + } + + void CorruptBuffer() { + if (fsynced_bytes_ >= size_) { + return; + } + uint64_t buffered_bytes = size_ - fsynced_bytes_; + uint64_t start = + fsynced_bytes_ + rnd_.Uniform(static_cast<int>(buffered_bytes)); + uint64_t end = std::min(start + 512, size_.load()); + MutexLock lock(&mutex_); + for (uint64_t pos = start; pos < end; ++pos) { + data_[static_cast<size_t>(pos)] = static_cast<char>(rnd_.Uniform(256)); + } + } + + IOStatus Read(uint64_t offset, size_t n, const IOOptions& /*options*/, + Slice* result, char* scratch, IODebugContext* /*dbg*/) const { + { + IOStatus s; + TEST_SYNC_POINT_CALLBACK("MemFile::Read:IOStatus", &s); + if (!s.ok()) { + // with sync point only + *result = Slice(); + return s; + } + } + MutexLock lock(&mutex_); + const uint64_t available = Size() - std::min(Size(), offset); + size_t offset_ = static_cast<size_t>(offset); + if (n > available) { + n = static_cast<size_t>(available); + } + if (n == 0) { + *result = Slice(); + return IOStatus::OK(); + } + if (scratch) { + memcpy(scratch, &(data_[offset_]), n); + *result = Slice(scratch, n); + } else { + *result = 
Slice(&(data_[offset_]), n); + } + return IOStatus::OK(); + } + + IOStatus Write(uint64_t offset, const Slice& data, + const IOOptions& /*options*/, IODebugContext* /*dbg*/) { + MutexLock lock(&mutex_); + size_t offset_ = static_cast<size_t>(offset); + if (offset + data.size() > data_.size()) { + data_.resize(offset_ + data.size()); + } + data_.replace(offset_, data.size(), data.data(), data.size()); + size_ = data_.size(); + modified_time_ = Now(); + return IOStatus::OK(); + } + + IOStatus Append(const Slice& data, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + MutexLock lock(&mutex_); + data_.append(data.data(), data.size()); + size_ = data_.size(); + modified_time_ = Now(); + return IOStatus::OK(); + } + + IOStatus Fsync(const IOOptions& /*options*/, IODebugContext* /*dbg*/) { + fsynced_bytes_ = size_.load(); + return IOStatus::OK(); + } + + uint64_t ModifiedTime() const { return modified_time_; } + + private: + uint64_t Now() { + int64_t unix_time = 0; + auto s = clock_->GetCurrentTime(&unix_time); + assert(s.ok()); + return static_cast<uint64_t>(unix_time); + } + + // Private since only Unref() should be used to delete it. + ~MemFile() { assert(refs_ == 0); } + + SystemClock* clock_; + const std::string fn_; + mutable port::Mutex mutex_; + int refs_; + bool is_lock_file_; + bool locked_; + + // Data written into this file, all bytes before fsynced_bytes are + // persistent. 
+ std::string data_; + std::atomic<uint64_t> size_; + std::atomic<uint64_t> modified_time_; + + Random rnd_; + std::atomic<uint64_t> fsynced_bytes_; +}; + +namespace { + +class MockSequentialFile : public FSSequentialFile { + public: + explicit MockSequentialFile(MemFile* file, const FileOptions& opts) + : file_(file), + use_direct_io_(opts.use_direct_reads), + use_mmap_read_(opts.use_mmap_reads), + pos_(0) { + file_->Ref(); + } + + ~MockSequentialFile() override { file_->Unref(); } + + IOStatus Read(size_t n, const IOOptions& options, Slice* result, + char* scratch, IODebugContext* dbg) override { + IOStatus s = file_->Read(pos_, n, options, result, + (use_mmap_read_) ? nullptr : scratch, dbg); + if (s.ok()) { + pos_ += result->size(); + } + return s; + } + + bool use_direct_io() const override { return use_direct_io_; } + IOStatus Skip(uint64_t n) override { + if (pos_ > file_->Size()) { + return IOStatus::IOError("pos_ > file_->Size()"); + } + const uint64_t available = file_->Size() - pos_; + if (n > available) { + n = available; + } + pos_ += static_cast<size_t>(n); + return IOStatus::OK(); + } + + private: + MemFile* file_; + bool use_direct_io_; + bool use_mmap_read_; + size_t pos_; +}; + +class MockRandomAccessFile : public FSRandomAccessFile { + public: + explicit MockRandomAccessFile(MemFile* file, const FileOptions& opts) + : file_(file), + use_direct_io_(opts.use_direct_reads), + use_mmap_read_(opts.use_mmap_reads) { + file_->Ref(); + } + + ~MockRandomAccessFile() override { file_->Unref(); } + + bool use_direct_io() const override { return use_direct_io_; } + + IOStatus Prefetch(uint64_t /*offset*/, size_t /*n*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override { + if (use_mmap_read_) { + return file_->Read(offset, n, options, result, nullptr, dbg); + } else { + 
return file_->Read(offset, n, options, result, scratch, dbg); + } + } + + private: + MemFile* file_; + bool use_direct_io_; + bool use_mmap_read_; +}; + +class MockRandomRWFile : public FSRandomRWFile { + public: + explicit MockRandomRWFile(MemFile* file) : file_(file) { file_->Ref(); } + + ~MockRandomRWFile() override { file_->Unref(); } + + IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& options, + IODebugContext* dbg) override { + return file_->Write(offset, data, options, dbg); + } + + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override { + return file_->Read(offset, n, options, result, scratch, dbg); + } + + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override { + return file_->Fsync(options, dbg); + } + + IOStatus Flush(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override { + return file_->Fsync(options, dbg); + } + + private: + MemFile* file_; +}; + +class MockWritableFile : public FSWritableFile { + public: + MockWritableFile(MemFile* file, const FileOptions& opts) + : file_(file), + use_direct_io_(opts.use_direct_writes), + rate_limiter_(opts.rate_limiter) { + file_->Ref(); + } + + ~MockWritableFile() override { file_->Unref(); } + + bool use_direct_io() const override { return false && use_direct_io_; } + + using FSWritableFile::Append; + IOStatus Append(const Slice& data, const IOOptions& options, + IODebugContext* dbg) override { + size_t bytes_written = 0; + while (bytes_written < data.size()) { + auto bytes = RequestToken(data.size() - bytes_written); + IOStatus s = file_->Append(Slice(data.data() + bytes_written, bytes), + options, dbg); + if (!s.ok()) { + return s; + } + bytes_written += bytes; + } + return IOStatus::OK(); + } + + using FSWritableFile::PositionedAppend; + IOStatus PositionedAppend(const Slice& 
data, uint64_t /*offset*/, + const IOOptions& options, + IODebugContext* dbg) override { + assert(use_direct_io_); + return Append(data, options, dbg); + } + + IOStatus Truncate(uint64_t size, const IOOptions& options, + IODebugContext* dbg) override { + file_->Truncate(static_cast<size_t>(size), options, dbg); + return IOStatus::OK(); + } + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override { + return file_->Fsync(options, dbg); + } + + IOStatus Flush(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override { + return file_->Fsync(options, dbg); + } + + uint64_t GetFileSize(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return file_->Size(); + } + + private: + inline size_t RequestToken(size_t bytes) { + if (rate_limiter_ && io_priority_ < Env::IO_TOTAL) { + bytes = std::min( + bytes, static_cast<size_t>(rate_limiter_->GetSingleBurstBytes())); + rate_limiter_->Request(bytes, io_priority_); + } + return bytes; + } + + MemFile* file_; + bool use_direct_io_; + RateLimiter* rate_limiter_; +}; + +class MockEnvDirectory : public FSDirectory { + public: + IOStatus Fsync(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + + IOStatus Close(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } +}; + +class MockEnvFileLock : public FileLock { + public: + explicit MockEnvFileLock(const std::string& fname) : fname_(fname) {} + + std::string FileName() const { return fname_; } + + private: + const std::string fname_; +}; + +class TestMemLogger : public Logger { + private: + std::unique_ptr<FSWritableFile> file_; + std::atomic_size_t log_size_; + static const uint64_t flush_every_seconds_ = 5; + std::atomic_uint_fast64_t last_flush_micros_; + SystemClock* clock_; + IOOptions options_; + IODebugContext* dbg_; + std::atomic<bool> 
flush_pending_; + + public: + TestMemLogger(std::unique_ptr<FSWritableFile> f, SystemClock* clock, + const IOOptions& options, IODebugContext* dbg, + const InfoLogLevel log_level = InfoLogLevel::ERROR_LEVEL) + : Logger(log_level), + file_(std::move(f)), + log_size_(0), + last_flush_micros_(0), + clock_(clock), + options_(options), + dbg_(dbg), + flush_pending_(false) {} + ~TestMemLogger() override {} + + void Flush() override { + if (flush_pending_) { + flush_pending_ = false; + } + last_flush_micros_ = clock_->NowMicros(); + } + + using Logger::Logv; + void Logv(const char* format, va_list ap) override { + // We try twice: the first time with a fixed-size stack allocated buffer, + // and the second time with a much larger dynamically allocated buffer. + char buffer[500]; + for (int iter = 0; iter < 2; iter++) { + char* base; + int bufsize; + if (iter == 0) { + bufsize = sizeof(buffer); + base = buffer; + } else { + bufsize = 30000; + base = new char[bufsize]; + } + char* p = base; + char* limit = base + bufsize; + + port::TimeVal now_tv; + port::GetTimeOfDay(&now_tv, nullptr); + const time_t seconds = now_tv.tv_sec; + struct tm t; + memset(&t, 0, sizeof(t)); + struct tm* ret __attribute__((__unused__)); + ret = port::LocalTimeR(&seconds, &t); + assert(ret); + p += snprintf(p, limit - p, "%04d/%02d/%02d-%02d:%02d:%02d.%06d ", + t.tm_year + 1900, t.tm_mon + 1, t.tm_mday, t.tm_hour, + t.tm_min, t.tm_sec, static_cast<int>(now_tv.tv_usec)); + + // Print the message + if (p < limit) { + va_list backup_ap; + va_copy(backup_ap, ap); + p += vsnprintf(p, limit - p, format, backup_ap); + va_end(backup_ap); + } + + // Truncate to available space if necessary + if (p >= limit) { + if (iter == 0) { + continue; // Try again with larger buffer + } else { + p = limit - 1; + } + } + + // Add newline if necessary + if (p == base || p[-1] != '\n') { + *p++ = '\n'; + } + + assert(p <= limit); + const size_t write_size = p - base; + + Status s = file_->Append(Slice(base, write_size), 
options_, dbg_); + if (s.ok()) { + flush_pending_ = true; + log_size_ += write_size; + } + uint64_t now_micros = + static_cast<uint64_t>(now_tv.tv_sec) * 1000000 + now_tv.tv_usec; + if (now_micros - last_flush_micros_ >= flush_every_seconds_ * 1000000) { + flush_pending_ = false; + last_flush_micros_ = now_micros; + } + if (base != buffer) { + delete[] base; + } + break; + } + } + size_t GetLogFileSize() const override { return log_size_; } +}; + +static std::unordered_map<std::string, OptionTypeInfo> mock_fs_type_info = { +#ifndef ROCKSDB_LITE + {"supports_direct_io", + {0, OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, +#endif // ROCKSDB_LITE +}; +} // namespace + +MockFileSystem::MockFileSystem(const std::shared_ptr<SystemClock>& clock, + bool supports_direct_io) + : system_clock_(clock), supports_direct_io_(supports_direct_io) { + clock_ = system_clock_.get(); + RegisterOptions("", &supports_direct_io_, &mock_fs_type_info); +} + +MockFileSystem::~MockFileSystem() { + for (auto i = file_map_.begin(); i != file_map_.end(); ++i) { + i->second->Unref(); + } +} + +Status MockFileSystem::PrepareOptions(const ConfigOptions& options) { + Status s = FileSystem::PrepareOptions(options); + if (s.ok() && system_clock_ == SystemClock::Default()) { + system_clock_ = options.env->GetSystemClock(); + clock_ = system_clock_.get(); + } + return s; +} + +IOStatus MockFileSystem::GetAbsolutePath(const std::string& db_path, + const IOOptions& /*options*/, + std::string* output_path, + IODebugContext* /*dbg*/) { + *output_path = NormalizeMockPath(db_path); + if (output_path->at(0) != '/') { + return IOStatus::NotSupported("GetAbsolutePath"); + } else { + return IOStatus::OK(); + } +} + +std::string MockFileSystem::NormalizeMockPath(const std::string& path) { + std::string p = NormalizePath(path); + if (p.back() == kFilePathSeparator && p.size() > 1) { + p.pop_back(); + } + return p; +} + +// Partial implementation of the FileSystem interface. 
+IOStatus MockFileSystem::NewSequentialFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr<FSSequentialFile>* result, IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); + + MutexLock lock(&mutex_); + if (file_map_.find(fn) == file_map_.end()) { + *result = nullptr; + return IOStatus::PathNotFound(fn); + } + auto* f = file_map_[fn]; + if (f->is_lock_file()) { + return IOStatus::InvalidArgument(fn, "Cannot open a lock file."); + } else if (file_opts.use_direct_reads && !supports_direct_io_) { + return IOStatus::NotSupported("Direct I/O Not Supported"); + } else { + result->reset(new MockSequentialFile(f, file_opts)); + return IOStatus::OK(); + } +} + +IOStatus MockFileSystem::NewRandomAccessFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr<FSRandomAccessFile>* result, IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); + MutexLock lock(&mutex_); + if (file_map_.find(fn) == file_map_.end()) { + *result = nullptr; + return IOStatus::PathNotFound(fn); + } + auto* f = file_map_[fn]; + if (f->is_lock_file()) { + return IOStatus::InvalidArgument(fn, "Cannot open a lock file."); + } else if (file_opts.use_direct_reads && !supports_direct_io_) { + return IOStatus::NotSupported("Direct I/O Not Supported"); + } else { + result->reset(new MockRandomAccessFile(f, file_opts)); + return IOStatus::OK(); + } +} + +IOStatus MockFileSystem::NewRandomRWFile( + const std::string& fname, const FileOptions& /*file_opts*/, + std::unique_ptr<FSRandomRWFile>* result, IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); + MutexLock lock(&mutex_); + if (file_map_.find(fn) == file_map_.end()) { + *result = nullptr; + return IOStatus::PathNotFound(fn); + } + auto* f = file_map_[fn]; + if (f->is_lock_file()) { + return IOStatus::InvalidArgument(fn, "Cannot open a lock file."); + } + result->reset(new MockRandomRWFile(f)); + return IOStatus::OK(); +} + +IOStatus MockFileSystem::ReuseWritableFile( + 
const std::string& fname, const std::string& old_fname, + const FileOptions& options, std::unique_ptr<FSWritableFile>* result, + IODebugContext* dbg) { + auto s = RenameFile(old_fname, fname, IOOptions(), dbg); + if (!s.ok()) { + return s; + } else { + result->reset(); + return NewWritableFile(fname, options, result, dbg); + } +} + +IOStatus MockFileSystem::NewWritableFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr<FSWritableFile>* result, IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); + MutexLock lock(&mutex_); + if (file_map_.find(fn) != file_map_.end()) { + DeleteFileInternal(fn); + } + MemFile* file = new MemFile(clock_, fn, false); + file->Ref(); + file_map_[fn] = file; + if (file_opts.use_direct_writes && !supports_direct_io_) { + return IOStatus::NotSupported("Direct I/O Not Supported"); + } else { + result->reset(new MockWritableFile(file, file_opts)); + return IOStatus::OK(); + } +} + +IOStatus MockFileSystem::ReopenWritableFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr<FSWritableFile>* result, IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); + MutexLock lock(&mutex_); + MemFile* file = nullptr; + if (file_map_.find(fn) == file_map_.end()) { + file = new MemFile(clock_, fn, false); + // Only take a reference when we create the file objectt + file->Ref(); + file_map_[fn] = file; + } else { + file = file_map_[fn]; + } + if (file_opts.use_direct_writes && !supports_direct_io_) { + return IOStatus::NotSupported("Direct I/O Not Supported"); + } else { + result->reset(new MockWritableFile(file, file_opts)); + return IOStatus::OK(); + } +} + +IOStatus MockFileSystem::NewDirectory(const std::string& /*name*/, + const IOOptions& /*io_opts*/, + std::unique_ptr<FSDirectory>* result, + IODebugContext* /*dbg*/) { + result->reset(new MockEnvDirectory()); + return IOStatus::OK(); +} + +IOStatus MockFileSystem::FileExists(const std::string& fname, + const IOOptions& 
/*io_opts*/, + IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); + MutexLock lock(&mutex_); + if (file_map_.find(fn) != file_map_.end()) { + // File exists + return IOStatus::OK(); + } + // Now also check if fn exists as a dir + for (const auto& iter : file_map_) { + const std::string& filename = iter.first; + if (filename.size() >= fn.size() + 1 && filename[fn.size()] == '/' && + Slice(filename).starts_with(Slice(fn))) { + return IOStatus::OK(); + } + } + return IOStatus::NotFound(); +} + +bool MockFileSystem::GetChildrenInternal(const std::string& dir, + std::vector<std::string>* result) { + auto d = NormalizeMockPath(dir); + bool found_dir = false; + result->clear(); + for (const auto& iter : file_map_) { + const std::string& filename = iter.first; + + if (filename == d) { + found_dir = true; + } else if (filename.size() >= d.size() + 1 && filename[d.size()] == '/' && + Slice(filename).starts_with(Slice(d))) { + found_dir = true; + size_t next_slash = filename.find('/', d.size() + 1); + if (next_slash != std::string::npos) { + result->push_back( + filename.substr(d.size() + 1, next_slash - d.size() - 1)); + } else { + result->push_back(filename.substr(d.size() + 1)); + } + } + } + result->erase(std::unique(result->begin(), result->end()), result->end()); + return found_dir; +} + +IOStatus MockFileSystem::GetChildren(const std::string& dir, + const IOOptions& /*options*/, + std::vector<std::string>* result, + IODebugContext* /*dbg*/) { + MutexLock lock(&mutex_); + bool found_dir = GetChildrenInternal(dir, result); +#ifndef __clang_analyzer__ + return found_dir ? IOStatus::OK() : IOStatus::NotFound(dir); +#else + return found_dir ? 
IOStatus::OK() : IOStatus::NotFound(); +#endif +} + +void MockFileSystem::DeleteFileInternal(const std::string& fname) { + assert(fname == NormalizeMockPath(fname)); + const auto& pair = file_map_.find(fname); + if (pair != file_map_.end()) { + pair->second->Unref(); + file_map_.erase(fname); + } +} + +IOStatus MockFileSystem::DeleteFile(const std::string& fname, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); + MutexLock lock(&mutex_); + if (file_map_.find(fn) == file_map_.end()) { + return IOStatus::PathNotFound(fn); + } + + DeleteFileInternal(fn); + return IOStatus::OK(); +} + +IOStatus MockFileSystem::Truncate(const std::string& fname, size_t size, + const IOOptions& options, + IODebugContext* dbg) { + auto fn = NormalizeMockPath(fname); + MutexLock lock(&mutex_); + auto iter = file_map_.find(fn); + if (iter == file_map_.end()) { + return IOStatus::PathNotFound(fn); + } + iter->second->Truncate(size, options, dbg); + return IOStatus::OK(); +} + +IOStatus MockFileSystem::CreateDir(const std::string& dirname, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + auto dn = NormalizeMockPath(dirname); + MutexLock lock(&mutex_); + if (file_map_.find(dn) == file_map_.end()) { + MemFile* file = new MemFile(clock_, dn, false); + file->Ref(); + file_map_[dn] = file; + } else { + return IOStatus::IOError(); + } + return IOStatus::OK(); +} + +IOStatus MockFileSystem::CreateDirIfMissing(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) { + CreateDir(dirname, options, dbg).PermitUncheckedError(); + return IOStatus::OK(); +} + +IOStatus MockFileSystem::DeleteDir(const std::string& dirname, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + auto dir = NormalizeMockPath(dirname); + MutexLock lock(&mutex_); + if (file_map_.find(dir) == file_map_.end()) { + return IOStatus::PathNotFound(dir); + } else { + std::vector<std::string> children; + if (GetChildrenInternal(dir, 
&children)) { + for (const auto& child : children) { + DeleteFileInternal(child); + } + } + DeleteFileInternal(dir); + return IOStatus::OK(); + } +} + +IOStatus MockFileSystem::GetFileSize(const std::string& fname, + const IOOptions& /*options*/, + uint64_t* file_size, + IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); + TEST_SYNC_POINT_CALLBACK("MockFileSystem::GetFileSize:CheckFileType", &fn); + MutexLock lock(&mutex_); + auto iter = file_map_.find(fn); + if (iter == file_map_.end()) { + return IOStatus::PathNotFound(fn); + } + + *file_size = iter->second->Size(); + return IOStatus::OK(); +} + +IOStatus MockFileSystem::GetFileModificationTime(const std::string& fname, + const IOOptions& /*options*/, + uint64_t* time, + IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); + MutexLock lock(&mutex_); + auto iter = file_map_.find(fn); + if (iter == file_map_.end()) { + return IOStatus::PathNotFound(fn); + } + *time = iter->second->ModifiedTime(); + return IOStatus::OK(); +} + +bool MockFileSystem::RenameFileInternal(const std::string& src, + const std::string& dest) { + if (file_map_.find(src) == file_map_.end()) { + return false; + } else { + std::vector<std::string> children; + if (GetChildrenInternal(src, &children)) { + for (const auto& child : children) { + RenameFileInternal(src + "/" + child, dest + "/" + child); + } + } + DeleteFileInternal(dest); + file_map_[dest] = file_map_[src]; + file_map_.erase(src); + return true; + } +} + +IOStatus MockFileSystem::RenameFile(const std::string& src, + const std::string& dest, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + auto s = NormalizeMockPath(src); + auto t = NormalizeMockPath(dest); + MutexLock lock(&mutex_); + bool found = RenameFileInternal(s, t); + if (!found) { + return IOStatus::PathNotFound(s); + } else { + return IOStatus::OK(); + } +} + +IOStatus MockFileSystem::LinkFile(const std::string& src, + const std::string& dest, + const IOOptions& /*options*/, + 
IODebugContext* /*dbg*/) { + auto s = NormalizeMockPath(src); + auto t = NormalizeMockPath(dest); + MutexLock lock(&mutex_); + if (file_map_.find(s) == file_map_.end()) { + return IOStatus::PathNotFound(s); + } + + DeleteFileInternal(t); + file_map_[t] = file_map_[s]; + file_map_[t]->Ref(); // Otherwise it might get deleted when noone uses s + return IOStatus::OK(); +} + +IOStatus MockFileSystem::NewLogger(const std::string& fname, + const IOOptions& io_opts, + std::shared_ptr<Logger>* result, + IODebugContext* dbg) { + auto fn = NormalizeMockPath(fname); + MutexLock lock(&mutex_); + auto iter = file_map_.find(fn); + MemFile* file = nullptr; + if (iter == file_map_.end()) { + file = new MemFile(clock_, fn, false); + file->Ref(); + file_map_[fn] = file; + } else { + file = iter->second; + } + std::unique_ptr<FSWritableFile> f(new MockWritableFile(file, FileOptions())); + result->reset(new TestMemLogger(std::move(f), clock_, io_opts, dbg)); + return IOStatus::OK(); +} + +IOStatus MockFileSystem::LockFile(const std::string& fname, + const IOOptions& /*options*/, + FileLock** flock, IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); + { + MutexLock lock(&mutex_); + if (file_map_.find(fn) != file_map_.end()) { + if (!file_map_[fn]->is_lock_file()) { + return IOStatus::InvalidArgument(fname, "Not a lock file."); + } + if (!file_map_[fn]->Lock()) { + return IOStatus::IOError(fn, "lock is already held."); + } + } else { + auto* file = new MemFile(clock_, fn, true); + file->Ref(); + file->Lock(); + file_map_[fn] = file; + } + } + *flock = new MockEnvFileLock(fn); + return IOStatus::OK(); +} + +IOStatus MockFileSystem::UnlockFile(FileLock* flock, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + std::string fn = static_cast_with_check<MockEnvFileLock>(flock)->FileName(); + { + MutexLock lock(&mutex_); + if (file_map_.find(fn) != file_map_.end()) { + if (!file_map_[fn]->is_lock_file()) { + return IOStatus::InvalidArgument(fn, "Not a lock file."); + 
} + file_map_[fn]->Unlock(); + } + } + delete flock; + return IOStatus::OK(); +} + +IOStatus MockFileSystem::GetTestDirectory(const IOOptions& /*options*/, + std::string* path, + IODebugContext* /*dbg*/) { + *path = "/test"; + return IOStatus::OK(); +} + +Status MockFileSystem::CorruptBuffer(const std::string& fname) { + auto fn = NormalizeMockPath(fname); + MutexLock lock(&mutex_); + auto iter = file_map_.find(fn); + if (iter == file_map_.end()) { + return Status::IOError(fn, "File not found"); + } + iter->second->CorruptBuffer(); + return Status::OK(); +} + +MockEnv::MockEnv(Env* env, const std::shared_ptr<FileSystem>& fs, + const std::shared_ptr<SystemClock>& clock) + : CompositeEnvWrapper(env, fs, clock) {} + +MockEnv* MockEnv::Create(Env* env) { + auto clock = + std::make_shared<EmulatedSystemClock>(env->GetSystemClock(), true); + return MockEnv::Create(env, clock); +} + +MockEnv* MockEnv::Create(Env* env, const std::shared_ptr<SystemClock>& clock) { + auto fs = std::make_shared<MockFileSystem>(clock); + return new MockEnv(env, fs, clock); +} + +Status MockEnv::CorruptBuffer(const std::string& fname) { + auto mock = static_cast_with_check<MockFileSystem>(GetFileSystem().get()); + return mock->CorruptBuffer(fname); +} + +#ifndef ROCKSDB_LITE +// This is to maintain the behavior before swithcing from InMemoryEnv to MockEnv +Env* NewMemEnv(Env* base_env) { return MockEnv::Create(base_env); } + +#else // ROCKSDB_LITE + +Env* NewMemEnv(Env* /*base_env*/) { return nullptr; } + +#endif // !ROCKSDB_LITE + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/env/mock_env.h b/src/rocksdb/env/mock_env.h new file mode 100644 index 000000000..406a31f63 --- /dev/null +++ b/src/rocksdb/env/mock_env.h @@ -0,0 +1,144 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#include <atomic> +#include <map> +#include <string> +#include <vector> + +#include "env/composite_env_wrapper.h" +#include "port/port.h" +#include "rocksdb/env.h" +#include "rocksdb/status.h" +#include "rocksdb/system_clock.h" + +namespace ROCKSDB_NAMESPACE { +class MemFile; +class MockFileSystem : public FileSystem { + public: + explicit MockFileSystem(const std::shared_ptr<SystemClock>& clock, + bool supports_direct_io = true); + ~MockFileSystem() override; + + static const char* kClassName() { return "MemoryFileSystem"; } + const char* Name() const override { return kClassName(); } + IOStatus NewSequentialFile(const std::string& f, const FileOptions& file_opts, + std::unique_ptr<FSSequentialFile>* r, + IODebugContext* dbg) override; + IOStatus NewRandomAccessFile(const std::string& f, + const FileOptions& file_opts, + std::unique_ptr<FSRandomAccessFile>* r, + IODebugContext* dbg) override; + + IOStatus NewRandomRWFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr<FSRandomRWFile>* result, + IODebugContext* dbg) override; + IOStatus ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + const FileOptions& file_opts, + std::unique_ptr<FSWritableFile>* result, + IODebugContext* dbg) override; + IOStatus NewWritableFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr<FSWritableFile>* result, + IODebugContext* dbg) override; + IOStatus ReopenWritableFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr<FSWritableFile>* result, + 
IODebugContext* dbg) override; + IOStatus NewDirectory(const std::string& /*name*/, const IOOptions& io_opts, + std::unique_ptr<FSDirectory>* result, + IODebugContext* dbg) override; + IOStatus FileExists(const std::string& fname, const IOOptions& /*io_opts*/, + IODebugContext* /*dbg*/) override; + IOStatus GetChildren(const std::string& dir, const IOOptions& options, + std::vector<std::string>* result, + IODebugContext* dbg) override; + IOStatus DeleteFile(const std::string& fname, const IOOptions& options, + IODebugContext* dbg) override; + IOStatus Truncate(const std::string& fname, size_t size, + const IOOptions& options, IODebugContext* dbg) override; + IOStatus CreateDir(const std::string& dirname, const IOOptions& options, + IODebugContext* dbg) override; + IOStatus CreateDirIfMissing(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) override; + IOStatus DeleteDir(const std::string& dirname, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus GetFileSize(const std::string& fname, const IOOptions& options, + uint64_t* file_size, IODebugContext* dbg) override; + + IOStatus GetFileModificationTime(const std::string& fname, + const IOOptions& options, + uint64_t* file_mtime, + IODebugContext* dbg) override; + IOStatus RenameFile(const std::string& src, const std::string& target, + const IOOptions& options, IODebugContext* dbg) override; + IOStatus LinkFile(const std::string& /*src*/, const std::string& /*target*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override; + IOStatus LockFile(const std::string& fname, const IOOptions& options, + FileLock** lock, IODebugContext* dbg) override; + IOStatus UnlockFile(FileLock* lock, const IOOptions& options, + IODebugContext* dbg) override; + IOStatus GetTestDirectory(const IOOptions& options, std::string* path, + IODebugContext* dbg) override; + IOStatus NewLogger(const std::string& fname, const IOOptions& io_opts, + std::shared_ptr<Logger>* result, + 
IODebugContext* dbg) override; + // Get full directory name for this db. + IOStatus GetAbsolutePath(const std::string& db_path, + const IOOptions& /*options*/, + std::string* output_path, + IODebugContext* /*dbg*/) override; + IOStatus IsDirectory(const std::string& /*path*/, + const IOOptions& /*options*/, bool* /*is_dir*/, + IODebugContext* /*dgb*/) override { + return IOStatus::NotSupported("IsDirectory"); + } + + Status CorruptBuffer(const std::string& fname); + Status PrepareOptions(const ConfigOptions& options) override; + + private: + bool RenameFileInternal(const std::string& src, const std::string& dest); + void DeleteFileInternal(const std::string& fname); + bool GetChildrenInternal(const std::string& fname, + std::vector<std::string>* results); + + std::string NormalizeMockPath(const std::string& path); + + private: + // Map from filenames to MemFile objects, representing a simple file system. + port::Mutex mutex_; + std::map<std::string, MemFile*> file_map_; // Protected by mutex_. + std::shared_ptr<SystemClock> system_clock_; + SystemClock* clock_; + bool supports_direct_io_; +}; + +class MockEnv : public CompositeEnvWrapper { + public: + static MockEnv* Create(Env* base); + static MockEnv* Create(Env* base, const std::shared_ptr<SystemClock>& clock); + + static const char* kClassName() { return "MockEnv"; } + const char* Name() const override { return kClassName(); } + + Status CorruptBuffer(const std::string& fname); + + private: + MockEnv(Env* env, const std::shared_ptr<FileSystem>& fs, + const std::shared_ptr<SystemClock>& clock); +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/env/mock_env_test.cc b/src/rocksdb/env/mock_env_test.cc new file mode 100644 index 000000000..be174bd73 --- /dev/null +++ b/src/rocksdb/env/mock_env_test.cc @@ -0,0 +1,84 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
See the AUTHORS file for names of contributors. +// +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + +#include "env/mock_env.h" + +#include <memory> +#include <string> + +#include "rocksdb/env.h" +#include "test_util/testharness.h" + +namespace ROCKSDB_NAMESPACE { + +class MockEnvTest : public testing::Test { + public: + MockEnv* env_; + const EnvOptions soptions_; + + MockEnvTest() : env_(MockEnv::Create(Env::Default())) {} + ~MockEnvTest() override { delete env_; } +}; + +TEST_F(MockEnvTest, Corrupt) { + const std::string kGood = "this is a good string, synced to disk"; + const std::string kCorrupted = "this part may be corrupted"; + const std::string kFileName = "/dir/f"; + std::unique_ptr<WritableFile> writable_file; + ASSERT_OK(env_->NewWritableFile(kFileName, &writable_file, soptions_)); + ASSERT_OK(writable_file->Append(kGood)); + ASSERT_TRUE(writable_file->GetFileSize() == kGood.size()); + + std::string scratch; + scratch.resize(kGood.size() + kCorrupted.size() + 16); + Slice result; + std::unique_ptr<RandomAccessFile> rand_file; + ASSERT_OK(env_->NewRandomAccessFile(kFileName, &rand_file, soptions_)); + ASSERT_OK(rand_file->Read(0, kGood.size(), &result, &(scratch[0]))); + ASSERT_EQ(result.compare(kGood), 0); + + // Sync + corrupt => no change + ASSERT_OK(writable_file->Fsync()); + ASSERT_OK(dynamic_cast<MockEnv*>(env_)->CorruptBuffer(kFileName)); + result.clear(); + ASSERT_OK(rand_file->Read(0, kGood.size(), &result, &(scratch[0]))); + ASSERT_EQ(result.compare(kGood), 0); + + // Add new data and corrupt it + ASSERT_OK(writable_file->Append(kCorrupted)); + ASSERT_TRUE(writable_file->GetFileSize() == kGood.size() + kCorrupted.size()); + result.clear(); + ASSERT_OK( + rand_file->Read(kGood.size(), kCorrupted.size(), &result, &(scratch[0]))); + ASSERT_EQ(result.compare(kCorrupted), 0); + // Corrupted + ASSERT_OK(dynamic_cast<MockEnv*>(env_)->CorruptBuffer(kFileName)); + result.clear(); + ASSERT_OK( + rand_file->Read(kGood.size(), 
kCorrupted.size(), &result, &(scratch[0]))); + ASSERT_NE(result.compare(kCorrupted), 0); +} + +TEST_F(MockEnvTest, FakeSleeping) { + int64_t now = 0; + auto s = env_->GetCurrentTime(&now); + ASSERT_OK(s); + env_->SleepForMicroseconds(3 * 1000 * 1000); + int64_t after_sleep = 0; + s = env_->GetCurrentTime(&after_sleep); + ASSERT_OK(s); + auto delta = after_sleep - now; + // this will be true unless test runs for 2 seconds + ASSERT_TRUE(delta == 3 || delta == 4); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/env/unique_id_gen.cc b/src/rocksdb/env/unique_id_gen.cc new file mode 100644 index 000000000..a1986fa15 --- /dev/null +++ b/src/rocksdb/env/unique_id_gen.cc @@ -0,0 +1,164 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "env/unique_id_gen.h" + +#include <algorithm> +#include <array> +#include <cstring> +#include <random> + +#include "port/port.h" +#include "rocksdb/env.h" +#include "rocksdb/version.h" +#include "util/hash.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { + +struct GenerateRawUniqueIdOpts { + Env* env = Env::Default(); + bool exclude_port_uuid = false; + bool exclude_env_details = false; + bool exclude_random_device = false; +}; + +// Each of these "tracks" below should be sufficient for generating 128 bits +// of entropy, after hashing the raw bytes. The tracks are separable for +// testing purposes, but in production we combine as many tracks as possible +// to ensure quality results even if some environments have degraded +// capabilities or quality in some APIs. +// +// This approach has not been validated for use in cryptography. 
The goal is +// generating globally unique values with high probability without coordination +// between instances. +// +// Linux performance: EntropyTrackRandomDevice is much faster than +// EntropyTrackEnvDetails, which is much faster than EntropyTrackPortUuid. + +struct EntropyTrackPortUuid { + std::array<char, 36> uuid; + + void Populate(const GenerateRawUniqueIdOpts& opts) { + if (opts.exclude_port_uuid) { + return; + } + std::string s; + port::GenerateRfcUuid(&s); + if (s.size() >= uuid.size()) { + std::copy_n(s.begin(), uuid.size(), uuid.begin()); + } + } +}; + +struct EntropyTrackEnvDetails { + std::array<char, 64> hostname_buf; + int64_t process_id; + uint64_t thread_id; + int64_t unix_time; + uint64_t nano_time; + + void Populate(const GenerateRawUniqueIdOpts& opts) { + if (opts.exclude_env_details) { + return; + } + opts.env->GetHostName(hostname_buf.data(), hostname_buf.size()) + .PermitUncheckedError(); + process_id = port::GetProcessID(); + thread_id = opts.env->GetThreadID(); + opts.env->GetCurrentTime(&unix_time).PermitUncheckedError(); + nano_time = opts.env->NowNanos(); + } +}; + +struct EntropyTrackRandomDevice { + using RandType = std::random_device::result_type; + static constexpr size_t kNumRandVals = + /* generous bits */ 192U / (8U * sizeof(RandType)); + std::array<RandType, kNumRandVals> rand_vals; + + void Populate(const GenerateRawUniqueIdOpts& opts) { + if (opts.exclude_random_device) { + return; + } + std::random_device r; + for (auto& val : rand_vals) { + val = r(); + } + } +}; + +struct Entropy { + uint64_t version_identifier; + EntropyTrackRandomDevice et1; + EntropyTrackEnvDetails et2; + EntropyTrackPortUuid et3; + + void Populate(const GenerateRawUniqueIdOpts& opts) { + // If we change the format of what goes into the entropy inputs, it's + // conceivable there could be a physical collision in the hash input + // even though they are logically different. 
This value should change + // if there's a change to the "schema" here, including byte order. + version_identifier = (uint64_t{ROCKSDB_MAJOR} << 32) + + (uint64_t{ROCKSDB_MINOR} << 16) + + uint64_t{ROCKSDB_PATCH}; + et1.Populate(opts); + et2.Populate(opts); + et3.Populate(opts); + } +}; + +void GenerateRawUniqueIdImpl(uint64_t* a, uint64_t* b, + const GenerateRawUniqueIdOpts& opts) { + Entropy e; + std::memset(&e, 0, sizeof(e)); + e.Populate(opts); + Hash2x64(reinterpret_cast<const char*>(&e), sizeof(e), a, b); +} + +} // namespace + +void GenerateRawUniqueId(uint64_t* a, uint64_t* b, bool exclude_port_uuid) { + GenerateRawUniqueIdOpts opts; + opts.exclude_port_uuid = exclude_port_uuid; + assert(!opts.exclude_env_details); + assert(!opts.exclude_random_device); + GenerateRawUniqueIdImpl(a, b, opts); +} + +#ifndef NDEBUG +void TEST_GenerateRawUniqueId(uint64_t* a, uint64_t* b, bool exclude_port_uuid, + bool exclude_env_details, + bool exclude_random_device) { + GenerateRawUniqueIdOpts opts; + opts.exclude_port_uuid = exclude_port_uuid; + opts.exclude_env_details = exclude_env_details; + opts.exclude_random_device = exclude_random_device; + GenerateRawUniqueIdImpl(a, b, opts); +} +#endif + +void SemiStructuredUniqueIdGen::Reset() { + saved_process_id_ = port::GetProcessID(); + GenerateRawUniqueId(&base_upper_, &base_lower_); + counter_ = 0; +} + +void SemiStructuredUniqueIdGen::GenerateNext(uint64_t* upper, uint64_t* lower) { + if (port::GetProcessID() == saved_process_id_) { + // Safe to increment the atomic for guaranteed uniqueness within this + // process lifetime. Xor slightly better than +. See + // https://github.com/pdillinger/unique_id + *lower = base_lower_ ^ counter_.fetch_add(1); + *upper = base_upper_; + } else { + // There must have been a fork() or something. Rather than attempting to + // update in a thread-safe way, simply fall back on GenerateRawUniqueId. 
+ GenerateRawUniqueId(upper, lower); + } +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/env/unique_id_gen.h b/src/rocksdb/env/unique_id_gen.h new file mode 100644 index 000000000..17e71e622 --- /dev/null +++ b/src/rocksdb/env/unique_id_gen.h @@ -0,0 +1,71 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +// This file is for functions that generate unique identifiers by +// (at least in part) by extracting novel entropy or sources of uniqueness +// from the execution environment. (By contrast, random.h is for algorithmic +// pseudorandomness.) +// +// These functions could eventually migrate to public APIs, such as in Env. + +#pragma once + +#include <atomic> +#include <cstdint> + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +// Generates a new 128-bit identifier that is universally unique +// (with high probability) for each call. The result is split into +// two 64-bit pieces. This function has NOT been validated for use in +// cryptography. +// +// This is used in generating DB session IDs and by Env::GenerateUniqueId +// (used for DB IDENTITY) if the platform does not provide a generator of +// RFC 4122 UUIDs or fails somehow. (Set exclude_port_uuid=true if this +// function is used as a fallback for GenerateRfcUuid, because no need +// trying it again.) +void GenerateRawUniqueId(uint64_t* a, uint64_t* b, + bool exclude_port_uuid = false); + +#ifndef NDEBUG +// A version of above with options for challenge testing +void TEST_GenerateRawUniqueId(uint64_t* a, uint64_t* b, bool exclude_port_uuid, + bool exclude_env_details, + bool exclude_random_device); +#endif + +// Generates globally unique ids with lower probability of any collisions +// vs. 
each unique id being independently random (GenerateRawUniqueId). +// We call this "semi-structured" because between different +// SemiStructuredUniqueIdGen objects, the IDs are separated by random +// intervals (unstructured), but within a single SemiStructuredUniqueIdGen +// object, the generated IDs are trivially related (structured). See +// https://github.com/pdillinger/unique_id for how this improves probability +// of no collision. In short, if we have n SemiStructuredUniqueIdGen +// objects each generating m IDs, the first collision is expected at +// around n = sqrt(2^128 / m), equivalently n * sqrt(m) = 2^64, +// rather than n * m = 2^64 for fully random IDs. +class SemiStructuredUniqueIdGen { + public: + // Initializes with random starting state (from GenerateRawUniqueId) + SemiStructuredUniqueIdGen() { Reset(); } + // Re-initializes, but not thread safe. Saves the current process ID, + // draws a new base pair from GenerateRawUniqueId(), and zeroes the + // counter. + void Reset(); + + // Assuming no fork(), `lower` is guaranteed unique from one call + // to the next (thread safe). While the process ID matches the one saved + // by Reset(), *upper is the stored upper base and *lower is the stored + // lower base XORed with the pre-increment counter value. Otherwise both + // outputs come from a fresh GenerateRawUniqueId() call and the stored + // state is left unchanged. + void GenerateNext(uint64_t* upper, uint64_t* lower); + + private: + // Base value filled in by Reset() via GenerateRawUniqueId(). + uint64_t base_upper_; + uint64_t base_lower_; + // Advanced with fetch_add(1) on each same-process GenerateNext() call. + std::atomic<uint64_t> counter_; + // Process ID recorded by Reset(); GenerateNext() compares against it. + int64_t saved_process_id_; +}; + +} // namespace ROCKSDB_NAMESPACE |