Diffstat (limited to 'src/rocksdb/db/compaction')
19 files changed, 12206 insertions, 0 deletions
diff --git a/src/rocksdb/db/compaction/compaction.cc b/src/rocksdb/db/compaction/compaction.cc new file mode 100644 index 000000000..5c34fdcaa --- /dev/null +++ b/src/rocksdb/db/compaction/compaction.cc @@ -0,0 +1,564 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include <cinttypes> +#include <vector> + +#include "db/column_family.h" +#include "db/compaction/compaction.h" +#include "rocksdb/compaction_filter.h" +#include "test_util/sync_point.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +const uint64_t kRangeTombstoneSentinel = + PackSequenceAndType(kMaxSequenceNumber, kTypeRangeDeletion); + +int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a, + const InternalKey& b) { + auto c = user_cmp->Compare(a.user_key(), b.user_key()); + if (c != 0) { + return c; + } + auto a_footer = ExtractInternalKeyFooter(a.Encode()); + auto b_footer = ExtractInternalKeyFooter(b.Encode()); + if (a_footer == kRangeTombstoneSentinel) { + if (b_footer != kRangeTombstoneSentinel) { + return -1; + } + } else if (b_footer == kRangeTombstoneSentinel) { + return 1; + } + return 0; +} + +int sstableKeyCompare(const Comparator* user_cmp, const InternalKey* a, + const InternalKey& b) { + if (a == nullptr) { + return -1; + } + return sstableKeyCompare(user_cmp, *a, b); +} + +int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a, + const InternalKey* b) { + if (b == nullptr) { + return -1; + } + return sstableKeyCompare(user_cmp, a, *b); +} + +uint64_t TotalFileSize(const std::vector<FileMetaData*>& files) { + uint64_t sum = 0; + for (size_t i = 0; i < files.size() && files[i]; i++) { + sum += files[i]->fd.GetFileSize(); + } + return sum; +} + +void Compaction::SetInputVersion(Version* _input_version) { + input_version_ = _input_version; + cfd_ = input_version_->cfd(); + + cfd_->Ref(); + input_version_->Ref(); + edit_.SetColumnFamily(cfd_->GetID()); +} + +void Compaction::GetBoundaryKeys( + VersionStorageInfo* vstorage, + const std::vector<CompactionInputFiles>& inputs, Slice* smallest_user_key, + Slice* largest_user_key) { + bool initialized = false; + const Comparator* ucmp = vstorage->InternalComparator()->user_comparator(); + for (size_t i = 0; i < inputs.size(); ++i) { + if (inputs[i].files.empty()) { + continue; + } + if (inputs[i].level == 0) { + // we need to consider all files on level 0 + for (const auto* f : inputs[i].files) { + const Slice& start_user_key = f->smallest.user_key(); + if (!initialized || + ucmp->Compare(start_user_key, *smallest_user_key) < 0) { + *smallest_user_key = start_user_key; + } + const Slice& end_user_key = f->largest.user_key(); + if (!initialized || + ucmp->Compare(end_user_key, *largest_user_key) > 0) { + *largest_user_key = end_user_key; + } + initialized = true; + } + } else { + // we only need to consider the first and last file + const Slice& start_user_key = inputs[i].files[0]->smallest.user_key(); + if (!initialized || + ucmp->Compare(start_user_key, *smallest_user_key) < 0) { + *smallest_user_key = start_user_key; + } + const Slice& end_user_key = 
inputs[i].files.back()->largest.user_key(); + if (!initialized || ucmp->Compare(end_user_key, *largest_user_key) > 0) { + *largest_user_key = end_user_key; + } + initialized = true; + } + } +} + +std::vector<CompactionInputFiles> Compaction::PopulateWithAtomicBoundaries( + VersionStorageInfo* vstorage, std::vector<CompactionInputFiles> inputs) { + const Comparator* ucmp = vstorage->InternalComparator()->user_comparator(); + for (size_t i = 0; i < inputs.size(); i++) { + if (inputs[i].level == 0 || inputs[i].files.empty()) { + continue; + } + inputs[i].atomic_compaction_unit_boundaries.reserve(inputs[i].files.size()); + AtomicCompactionUnitBoundary cur_boundary; + size_t first_atomic_idx = 0; + auto add_unit_boundary = [&](size_t to) { + if (first_atomic_idx == to) return; + for (size_t k = first_atomic_idx; k < to; k++) { + inputs[i].atomic_compaction_unit_boundaries.push_back(cur_boundary); + } + first_atomic_idx = to; + }; + for (size_t j = 0; j < inputs[i].files.size(); j++) { + const auto* f = inputs[i].files[j]; + if (j == 0) { + // First file in a level. + cur_boundary.smallest = &f->smallest; + cur_boundary.largest = &f->largest; + } else if (sstableKeyCompare(ucmp, *cur_boundary.largest, f->smallest) == + 0) { + // SSTs overlap but the end key of the previous file was not + // artificially extended by a range tombstone. Extend the current + // boundary. + cur_boundary.largest = &f->largest; + } else { + // Atomic compaction unit has ended. + add_unit_boundary(j); + cur_boundary.smallest = &f->smallest; + cur_boundary.largest = &f->largest; + } + } + add_unit_boundary(inputs[i].files.size()); + assert(inputs[i].files.size() == + inputs[i].atomic_compaction_unit_boundaries.size()); + } + return inputs; +} + +// helper function to determine if compaction is creating files at the +// bottommost level +bool Compaction::IsBottommostLevel( + int output_level, VersionStorageInfo* vstorage, + const std::vector<CompactionInputFiles>& inputs) { + int output_l0_idx; + if (output_level == 0) { + output_l0_idx = 0; + for (const auto* file : vstorage->LevelFiles(0)) { + if (inputs[0].files.back() == file) { + break; + } + ++output_l0_idx; + } + assert(static_cast<size_t>(output_l0_idx) < vstorage->LevelFiles(0).size()); + } else { + output_l0_idx = -1; + } + Slice smallest_key, largest_key; + GetBoundaryKeys(vstorage, inputs, &smallest_key, &largest_key); + return !vstorage->RangeMightExistAfterSortedRun(smallest_key, largest_key, + output_level, output_l0_idx); +} + +// test function to validate the functionality of IsBottommostLevel() +// function -- determines if compaction with inputs and storage is bottommost +bool Compaction::TEST_IsBottommostLevel( + int output_level, VersionStorageInfo* vstorage, + const std::vector<CompactionInputFiles>& inputs) { + return IsBottommostLevel(output_level, vstorage, inputs); +} + +bool Compaction::IsFullCompaction( + VersionStorageInfo* vstorage, + const std::vector<CompactionInputFiles>& inputs) { + size_t num_files_in_compaction = 0; + size_t total_num_files = 0; + for (int l = 0; l < vstorage->num_levels(); l++) { + total_num_files += vstorage->NumLevelFiles(l); + } + for (size_t i = 0; i < inputs.size(); i++) { + num_files_in_compaction += inputs[i].size(); + } + return num_files_in_compaction == total_num_files; +} + +Compaction::Compaction(VersionStorageInfo* vstorage, + const ImmutableCFOptions& _immutable_cf_options, + const MutableCFOptions& _mutable_cf_options, + std::vector<CompactionInputFiles> _inputs, + int _output_level, uint64_t 
_target_file_size, + uint64_t _max_compaction_bytes, uint32_t _output_path_id, + CompressionType _compression, + CompressionOptions _compression_opts, + uint32_t _max_subcompactions, + std::vector<FileMetaData*> _grandparents, + bool _manual_compaction, double _score, + bool _deletion_compaction, + CompactionReason _compaction_reason) + : input_vstorage_(vstorage), + start_level_(_inputs[0].level), + output_level_(_output_level), + max_output_file_size_(_target_file_size), + max_compaction_bytes_(_max_compaction_bytes), + max_subcompactions_(_max_subcompactions), + immutable_cf_options_(_immutable_cf_options), + mutable_cf_options_(_mutable_cf_options), + input_version_(nullptr), + number_levels_(vstorage->num_levels()), + cfd_(nullptr), + output_path_id_(_output_path_id), + output_compression_(_compression), + output_compression_opts_(_compression_opts), + deletion_compaction_(_deletion_compaction), + inputs_(PopulateWithAtomicBoundaries(vstorage, std::move(_inputs))), + grandparents_(std::move(_grandparents)), + score_(_score), + bottommost_level_(IsBottommostLevel(output_level_, vstorage, inputs_)), + is_full_compaction_(IsFullCompaction(vstorage, inputs_)), + is_manual_compaction_(_manual_compaction), + is_trivial_move_(false), + compaction_reason_(_compaction_reason) { + MarkFilesBeingCompacted(true); + if (is_manual_compaction_) { + compaction_reason_ = CompactionReason::kManualCompaction; + } + if (max_subcompactions_ == 0) { + max_subcompactions_ = immutable_cf_options_.max_subcompactions; + } + if (!bottommost_level_) { + // Currently we only enable dictionary compression during compaction to the + // bottommost level. + output_compression_opts_.max_dict_bytes = 0; + output_compression_opts_.zstd_max_train_bytes = 0; + } + +#ifndef NDEBUG + for (size_t i = 1; i < inputs_.size(); ++i) { + assert(inputs_[i].level > inputs_[i - 1].level); + } +#endif + + // setup input_levels_ + { + input_levels_.resize(num_input_levels()); + for (size_t which = 0; which < num_input_levels(); which++) { + DoGenerateLevelFilesBrief(&input_levels_[which], inputs_[which].files, + &arena_); + } + } + + GetBoundaryKeys(vstorage, inputs_, &smallest_user_key_, &largest_user_key_); +} + +Compaction::~Compaction() { + if (input_version_ != nullptr) { + input_version_->Unref(); + } + if (cfd_ != nullptr) { + cfd_->UnrefAndTryDelete(); + } +} + +bool Compaction::InputCompressionMatchesOutput() const { + int base_level = input_vstorage_->base_level(); + bool matches = (GetCompressionType(immutable_cf_options_, input_vstorage_, + mutable_cf_options_, start_level_, + base_level) == output_compression_); + if (matches) { + TEST_SYNC_POINT("Compaction::InputCompressionMatchesOutput:Matches"); + return true; + } + TEST_SYNC_POINT("Compaction::InputCompressionMatchesOutput:DidntMatch"); + return matches; +} + +bool Compaction::IsTrivialMove() const { + // Avoid a move if there is lots of overlapping grandparent data. + // Otherwise, the move could create a parent file that will require + // a very expensive merge later on. + // If start_level_== output_level_, the purpose is to force compaction + // filter to be applied to that level, and thus cannot be a trivial move. 
+ + // Check if start level have files with overlapping ranges + if (start_level_ == 0 && input_vstorage_->level0_non_overlapping() == false) { + // We cannot move files from L0 to L1 if the files are overlapping + return false; + } + + if (is_manual_compaction_ && + (immutable_cf_options_.compaction_filter != nullptr || + immutable_cf_options_.compaction_filter_factory != nullptr)) { + // This is a manual compaction and we have a compaction filter that should + // be executed, we cannot do a trivial move + return false; + } + + // Used in universal compaction, where trivial move can be done if the + // input files are non overlapping + if ((mutable_cf_options_.compaction_options_universal.allow_trivial_move) && + (output_level_ != 0)) { + return is_trivial_move_; + } + + if (!(start_level_ != output_level_ && num_input_levels() == 1 && + input(0, 0)->fd.GetPathId() == output_path_id() && + InputCompressionMatchesOutput())) { + return false; + } + + // assert inputs_.size() == 1 + + for (const auto& file : inputs_.front().files) { + std::vector<FileMetaData*> file_grand_parents; + if (output_level_ + 1 >= number_levels_) { + continue; + } + input_vstorage_->GetOverlappingInputs(output_level_ + 1, &file->smallest, + &file->largest, &file_grand_parents); + const auto compaction_size = + file->fd.GetFileSize() + TotalFileSize(file_grand_parents); + if (compaction_size > max_compaction_bytes_) { + return false; + } + } + + return true; +} + +void Compaction::AddInputDeletions(VersionEdit* out_edit) { + for (size_t which = 0; which < num_input_levels(); which++) { + for (size_t i = 0; i < inputs_[which].size(); i++) { + out_edit->DeleteFile(level(which), inputs_[which][i]->fd.GetNumber()); + } + } +} + +bool Compaction::KeyNotExistsBeyondOutputLevel( + const Slice& user_key, std::vector<size_t>* level_ptrs) const { + assert(input_version_ != nullptr); + assert(level_ptrs != nullptr); + assert(level_ptrs->size() == static_cast<size_t>(number_levels_)); + if (bottommost_level_) { + return true; + } else if (output_level_ != 0 && + cfd_->ioptions()->compaction_style == kCompactionStyleLevel) { + // Maybe use binary search to find right entry instead of linear search? + const Comparator* user_cmp = cfd_->user_comparator(); + for (int lvl = output_level_ + 1; lvl < number_levels_; lvl++) { + const std::vector<FileMetaData*>& files = + input_vstorage_->LevelFiles(lvl); + for (; level_ptrs->at(lvl) < files.size(); level_ptrs->at(lvl)++) { + auto* f = files[level_ptrs->at(lvl)]; + if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) { + // We've advanced far enough + if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) { + // Key falls in this file's range, so it may + // exist beyond output level + return false; + } + break; + } + } + } + return true; + } + return false; +} + +// Mark (or clear) each file that is being compacted +void Compaction::MarkFilesBeingCompacted(bool mark_as_compacted) { + for (size_t i = 0; i < num_input_levels(); i++) { + for (size_t j = 0; j < inputs_[i].size(); j++) { + assert(mark_as_compacted ? 
!inputs_[i][j]->being_compacted + : inputs_[i][j]->being_compacted); + inputs_[i][j]->being_compacted = mark_as_compacted; + } + } +} + +// Sample output: +// If compacting 3 L0 files, 2 L3 files and 1 L4 file, and outputting to L5, +// print: "3@0 + 2@3 + 1@4 files to L5" +const char* Compaction::InputLevelSummary( + InputLevelSummaryBuffer* scratch) const { + int len = 0; + bool is_first = true; + for (auto& input_level : inputs_) { + if (input_level.empty()) { + continue; + } + if (!is_first) { + len += + snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, " + "); + len = std::min(len, static_cast<int>(sizeof(scratch->buffer))); + } else { + is_first = false; + } + len += snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, + "%" ROCKSDB_PRIszt "@%d", input_level.size(), + input_level.level); + len = std::min(len, static_cast<int>(sizeof(scratch->buffer))); + } + snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, + " files to L%d", output_level()); + + return scratch->buffer; +} + +uint64_t Compaction::CalculateTotalInputSize() const { + uint64_t size = 0; + for (auto& input_level : inputs_) { + for (auto f : input_level.files) { + size += f->fd.GetFileSize(); + } + } + return size; +} + +void Compaction::ReleaseCompactionFiles(Status status) { + MarkFilesBeingCompacted(false); + cfd_->compaction_picker()->ReleaseCompactionFiles(this, status); +} + +void Compaction::ResetNextCompactionIndex() { + assert(input_version_ != nullptr); + input_vstorage_->ResetNextCompactionIndex(start_level_); +} + +namespace { +int InputSummary(const std::vector<FileMetaData*>& files, char* output, + int len) { + *output = '\0'; + int write = 0; + for (size_t i = 0; i < files.size(); i++) { + int sz = len - write; + int ret; + char sztxt[16]; + AppendHumanBytes(files.at(i)->fd.GetFileSize(), sztxt, 16); + ret = snprintf(output + write, sz, "%" PRIu64 "(%s) ", + files.at(i)->fd.GetNumber(), sztxt); + if (ret < 0 || ret >= sz) break; + write += ret; + } + // if files.size() is non-zero, overwrite the last space + return write - !!files.size(); +} +} // namespace + +void Compaction::Summary(char* output, int len) { + int write = + snprintf(output, len, "Base version %" PRIu64 " Base level %d, inputs: [", + input_version_->GetVersionNumber(), start_level_); + if (write < 0 || write >= len) { + return; + } + + for (size_t level_iter = 0; level_iter < num_input_levels(); ++level_iter) { + if (level_iter > 0) { + write += snprintf(output + write, len - write, "], ["); + if (write < 0 || write >= len) { + return; + } + } + write += + InputSummary(inputs_[level_iter].files, output + write, len - write); + if (write < 0 || write >= len) { + return; + } + } + + snprintf(output + write, len - write, "]"); +} + +uint64_t Compaction::OutputFilePreallocationSize() const { + uint64_t preallocation_size = 0; + + for (const auto& level_files : inputs_) { + for (const auto& file : level_files.files) { + preallocation_size += file->fd.GetFileSize(); + } + } + + if (max_output_file_size_ != port::kMaxUint64 && + (immutable_cf_options_.compaction_style == kCompactionStyleLevel || + output_level() > 0)) { + preallocation_size = std::min(max_output_file_size_, preallocation_size); + } + + // Over-estimate slightly so we don't end up just barely crossing + // the threshold + // No point to prellocate more than 1GB. 
+ return std::min(uint64_t{1073741824}, + preallocation_size + (preallocation_size / 10)); +} + +std::unique_ptr<CompactionFilter> Compaction::CreateCompactionFilter() const { + if (!cfd_->ioptions()->compaction_filter_factory) { + return nullptr; + } + + CompactionFilter::Context context; + context.is_full_compaction = is_full_compaction_; + context.is_manual_compaction = is_manual_compaction_; + context.column_family_id = cfd_->GetID(); + return cfd_->ioptions()->compaction_filter_factory->CreateCompactionFilter( + context); +} + +bool Compaction::IsOutputLevelEmpty() const { + return inputs_.back().level != output_level_ || inputs_.back().empty(); +} + +bool Compaction::ShouldFormSubcompactions() const { + if (max_subcompactions_ <= 1 || cfd_ == nullptr) { + return false; + } + if (cfd_->ioptions()->compaction_style == kCompactionStyleLevel) { + return (start_level_ == 0 || is_manual_compaction_) && output_level_ > 0 && + !IsOutputLevelEmpty(); + } else if (cfd_->ioptions()->compaction_style == kCompactionStyleUniversal) { + return number_levels_ > 1 && output_level_ > 0; + } else { + return false; + } +} + +uint64_t Compaction::MinInputFileOldestAncesterTime() const { + uint64_t min_oldest_ancester_time = port::kMaxUint64; + for (const auto& level_files : inputs_) { + for (const auto& file : level_files.files) { + uint64_t oldest_ancester_time = file->TryGetOldestAncesterTime(); + if (oldest_ancester_time != 0) { + min_oldest_ancester_time = + std::min(min_oldest_ancester_time, oldest_ancester_time); + } + } + } + return min_oldest_ancester_time; +} + +int Compaction::GetInputBaseLevel() const { + return input_vstorage_->base_level(); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/compaction/compaction.h b/src/rocksdb/db/compaction/compaction.h new file mode 100644 index 000000000..9358e50ff --- /dev/null +++ b/src/rocksdb/db/compaction/compaction.h @@ -0,0 +1,384 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include "db/version_set.h" +#include "memory/arena.h" +#include "options/cf_options.h" +#include "util/autovector.h" + +namespace ROCKSDB_NAMESPACE { +// The file contains class Compaction, as well as some helper functions +// and data structures used by the class. + +// Utility for comparing sstable boundary keys. Returns -1 if either a or b is +// null which provides the property that a==null indicates a key that is less +// than any key and b==null indicates a key that is greater than any key. Note +// that the comparison is performed primarily on the user-key portion of the +// key. If the user-keys compare equal, an additional test is made to sort +// range tombstone sentinel keys before other keys with the same user-key. The +// result is that 2 user-keys will compare equal if they differ purely on +// their sequence number and value, but the range tombstone sentinel for that +// user-key will compare not equal. This is necessary because the range +// tombstone sentinel key is set as the largest key for an sstable even though +// that key never appears in the database. 
We don't want adjacent sstables to +// be considered overlapping if they are separated by the range tombstone +// sentinel. +int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a, + const InternalKey& b); +int sstableKeyCompare(const Comparator* user_cmp, const InternalKey* a, + const InternalKey& b); +int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a, + const InternalKey* b); + +// An AtomicCompactionUnitBoundary represents a range of keys [smallest, +// largest] that exactly spans one ore more neighbouring SSTs on the same +// level. Every pair of SSTs in this range "overlap" (i.e., the largest +// user key of one file is the smallest user key of the next file). These +// boundaries are propagated down to RangeDelAggregator during compaction +// to provide safe truncation boundaries for range tombstones. +struct AtomicCompactionUnitBoundary { + const InternalKey* smallest = nullptr; + const InternalKey* largest = nullptr; +}; + +// The structure that manages compaction input files associated +// with the same physical level. +struct CompactionInputFiles { + int level; + std::vector<FileMetaData*> files; + std::vector<AtomicCompactionUnitBoundary> atomic_compaction_unit_boundaries; + inline bool empty() const { return files.empty(); } + inline size_t size() const { return files.size(); } + inline void clear() { files.clear(); } + inline FileMetaData* operator[](size_t i) const { return files[i]; } +}; + +class Version; +class ColumnFamilyData; +class VersionStorageInfo; +class CompactionFilter; + +// A Compaction encapsulates metadata about a compaction. +class Compaction { + public: + Compaction(VersionStorageInfo* input_version, + const ImmutableCFOptions& immutable_cf_options, + const MutableCFOptions& mutable_cf_options, + std::vector<CompactionInputFiles> inputs, int output_level, + uint64_t target_file_size, uint64_t max_compaction_bytes, + uint32_t output_path_id, CompressionType compression, + CompressionOptions compression_opts, uint32_t max_subcompactions, + std::vector<FileMetaData*> grandparents, + bool manual_compaction = false, double score = -1, + bool deletion_compaction = false, + CompactionReason compaction_reason = CompactionReason::kUnknown); + + // No copying allowed + Compaction(const Compaction&) = delete; + void operator=(const Compaction&) = delete; + + ~Compaction(); + + // Returns the level associated to the specified compaction input level. + // If compaction_input_level is not specified, then input_level is set to 0. + int level(size_t compaction_input_level = 0) const { + return inputs_[compaction_input_level].level; + } + + int start_level() const { return start_level_; } + + // Outputs will go to this level + int output_level() const { return output_level_; } + + // Returns the number of input levels in this compaction. + size_t num_input_levels() const { return inputs_.size(); } + + // Return the object that holds the edits to the descriptor done + // by this compaction. + VersionEdit* edit() { return &edit_; } + + // Returns the number of input files associated to the specified + // compaction input level. + // The function will return 0 if when "compaction_input_level" < 0 + // or "compaction_input_level" >= "num_input_levels()". 
+ size_t num_input_files(size_t compaction_input_level) const { + if (compaction_input_level < inputs_.size()) { + return inputs_[compaction_input_level].size(); + } + return 0; + } + + // Returns input version of the compaction + Version* input_version() const { return input_version_; } + + // Returns the ColumnFamilyData associated with the compaction. + ColumnFamilyData* column_family_data() const { return cfd_; } + + // Returns the file meta data of the 'i'th input file at the + // specified compaction input level. + // REQUIREMENT: "compaction_input_level" must be >= 0 and + // < "input_levels()" + FileMetaData* input(size_t compaction_input_level, size_t i) const { + assert(compaction_input_level < inputs_.size()); + return inputs_[compaction_input_level][i]; + } + + const std::vector<AtomicCompactionUnitBoundary>* boundaries( + size_t compaction_input_level) const { + assert(compaction_input_level < inputs_.size()); + return &inputs_[compaction_input_level].atomic_compaction_unit_boundaries; + } + + // Returns the list of file meta data of the specified compaction + // input level. + // REQUIREMENT: "compaction_input_level" must be >= 0 and + // < "input_levels()" + const std::vector<FileMetaData*>* inputs( + size_t compaction_input_level) const { + assert(compaction_input_level < inputs_.size()); + return &inputs_[compaction_input_level].files; + } + + const std::vector<CompactionInputFiles>* inputs() { return &inputs_; } + + // Returns the LevelFilesBrief of the specified compaction input level. + const LevelFilesBrief* input_levels(size_t compaction_input_level) const { + return &input_levels_[compaction_input_level]; + } + + // Maximum size of files to build during this compaction. + uint64_t max_output_file_size() const { return max_output_file_size_; } + + // What compression for output + CompressionType output_compression() const { return output_compression_; } + + // What compression options for output + CompressionOptions output_compression_opts() const { + return output_compression_opts_; + } + + // Whether need to write output file to second DB path. + uint32_t output_path_id() const { return output_path_id_; } + + // Is this a trivial compaction that can be implemented by just + // moving a single input file to the next level (no merging or splitting) + bool IsTrivialMove() const; + + // If true, then the compaction can be done by simply deleting input files. + bool deletion_compaction() const { return deletion_compaction_; } + + // Add all inputs to this compaction as delete operations to *edit. + void AddInputDeletions(VersionEdit* edit); + + // Returns true if the available information we have guarantees that + // the input "user_key" does not exist in any level beyond "output_level()". + bool KeyNotExistsBeyondOutputLevel(const Slice& user_key, + std::vector<size_t>* level_ptrs) const; + + // Clear all files to indicate that they are not being compacted + // Delete this compaction from the list of running compactions. + // + // Requirement: DB mutex held + void ReleaseCompactionFiles(Status status); + + // Returns the summary of the compaction in "output" with maximum "len" + // in bytes. The caller is responsible for the memory management of + // "output". + void Summary(char* output, int len); + + // Return the score that was used to pick this compaction run. + double score() const { return score_; } + + // Is this compaction creating a file in the bottom most level? 
+ bool bottommost_level() const { return bottommost_level_; } + + // Does this compaction include all sst files? + bool is_full_compaction() const { return is_full_compaction_; } + + // Was this compaction triggered manually by the client? + bool is_manual_compaction() const { return is_manual_compaction_; } + + // Used when allow_trivial_move option is set in + // Universal compaction. If all the input files are + // non overlapping, then is_trivial_move_ variable + // will be set true, else false + void set_is_trivial_move(bool trivial_move) { + is_trivial_move_ = trivial_move; + } + + // Used when allow_trivial_move option is set in + // Universal compaction. Returns true, if the input files + // are non-overlapping and can be trivially moved. + bool is_trivial_move() const { return is_trivial_move_; } + + // How many total levels are there? + int number_levels() const { return number_levels_; } + + // Return the ImmutableCFOptions that should be used throughout the compaction + // procedure + const ImmutableCFOptions* immutable_cf_options() const { + return &immutable_cf_options_; + } + + // Return the MutableCFOptions that should be used throughout the compaction + // procedure + const MutableCFOptions* mutable_cf_options() const { + return &mutable_cf_options_; + } + + // Returns the size in bytes that the output file should be preallocated to. + // In level compaction, that is max_file_size_. In universal compaction, that + // is the sum of all input file sizes. + uint64_t OutputFilePreallocationSize() const; + + void SetInputVersion(Version* input_version); + + struct InputLevelSummaryBuffer { + char buffer[128]; + }; + + const char* InputLevelSummary(InputLevelSummaryBuffer* scratch) const; + + uint64_t CalculateTotalInputSize() const; + + // In case of compaction error, reset the nextIndex that is used + // to pick up the next file to be compacted from files_by_size_ + void ResetNextCompactionIndex(); + + // Create a CompactionFilter from compaction_filter_factory + std::unique_ptr<CompactionFilter> CreateCompactionFilter() const; + + // Is the input level corresponding to output_level_ empty? + bool IsOutputLevelEmpty() const; + + // Should this compaction be broken up into smaller ones run in parallel? 
+ bool ShouldFormSubcompactions() const; + + // test function to validate the functionality of IsBottommostLevel() + // function -- determines if compaction with inputs and storage is bottommost + static bool TEST_IsBottommostLevel( + int output_level, VersionStorageInfo* vstorage, + const std::vector<CompactionInputFiles>& inputs); + + TablePropertiesCollection GetOutputTableProperties() const { + return output_table_properties_; + } + + void SetOutputTableProperties(TablePropertiesCollection tp) { + output_table_properties_ = std::move(tp); + } + + Slice GetSmallestUserKey() const { return smallest_user_key_; } + + Slice GetLargestUserKey() const { return largest_user_key_; } + + int GetInputBaseLevel() const; + + CompactionReason compaction_reason() { return compaction_reason_; } + + const std::vector<FileMetaData*>& grandparents() const { + return grandparents_; + } + + uint64_t max_compaction_bytes() const { return max_compaction_bytes_; } + + uint32_t max_subcompactions() const { return max_subcompactions_; } + + uint64_t MinInputFileOldestAncesterTime() const; + + private: + // mark (or clear) all files that are being compacted + void MarkFilesBeingCompacted(bool mark_as_compacted); + + // get the smallest and largest key present in files to be compacted + static void GetBoundaryKeys(VersionStorageInfo* vstorage, + const std::vector<CompactionInputFiles>& inputs, + Slice* smallest_key, Slice* largest_key); + + // Get the atomic file boundaries for all files in the compaction. Necessary + // in order to avoid the scenario described in + // https://github.com/facebook/rocksdb/pull/4432#discussion_r221072219 and plumb + // down appropriate key boundaries to RangeDelAggregator during compaction. + static std::vector<CompactionInputFiles> PopulateWithAtomicBoundaries( + VersionStorageInfo* vstorage, std::vector<CompactionInputFiles> inputs); + + // helper function to determine if compaction with inputs and storage is + // bottommost + static bool IsBottommostLevel( + int output_level, VersionStorageInfo* vstorage, + const std::vector<CompactionInputFiles>& inputs); + + static bool IsFullCompaction(VersionStorageInfo* vstorage, + const std::vector<CompactionInputFiles>& inputs); + + VersionStorageInfo* input_vstorage_; + + const int start_level_; // the lowest level to be compacted + const int output_level_; // levels to which output files are stored + uint64_t max_output_file_size_; + uint64_t max_compaction_bytes_; + uint32_t max_subcompactions_; + const ImmutableCFOptions immutable_cf_options_; + const MutableCFOptions mutable_cf_options_; + Version* input_version_; + VersionEdit edit_; + const int number_levels_; + ColumnFamilyData* cfd_; + Arena arena_; // Arena used to allocate space for file_levels_ + + const uint32_t output_path_id_; + CompressionType output_compression_; + CompressionOptions output_compression_opts_; + // If true, then the comaction can be done by simply deleting input files. + const bool deletion_compaction_; + + // Compaction input files organized by level. Constant after construction + const std::vector<CompactionInputFiles> inputs_; + + // A copy of inputs_, organized more closely in memory + autovector<LevelFilesBrief, 2> input_levels_; + + // State used to check for number of overlapping grandparent files + // (grandparent == "output_level_ + 1") + std::vector<FileMetaData*> grandparents_; + const double score_; // score that was used to pick this compaction. + + // Is this compaction creating a file in the bottom most level? 
+ const bool bottommost_level_; + // Does this compaction include all sst files? + const bool is_full_compaction_; + + // Is this compaction requested by the client? + const bool is_manual_compaction_; + + // True if we can do trivial move in Universal multi level + // compaction + bool is_trivial_move_; + + // Does input compression match the output compression? + bool InputCompressionMatchesOutput() const; + + // table properties of output files + TablePropertiesCollection output_table_properties_; + + // smallest user keys in compaction + Slice smallest_user_key_; + + // largest user keys in compaction + Slice largest_user_key_; + + // Reason for compaction + CompactionReason compaction_reason_; +}; + +// Return sum of sizes of all files in `files`. +extern uint64_t TotalFileSize(const std::vector<FileMetaData*>& files); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/compaction/compaction_iteration_stats.h b/src/rocksdb/db/compaction/compaction_iteration_stats.h new file mode 100644 index 000000000..963c1d8eb --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_iteration_stats.h @@ -0,0 +1,37 @@ +// Copyright (c) 2016-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/rocksdb_namespace.h" + +struct CompactionIterationStats { + // Compaction statistics + + // Doesn't include records skipped because of + // CompactionFilter::Decision::kRemoveAndSkipUntil. + int64_t num_record_drop_user = 0; + + int64_t num_record_drop_hidden = 0; + int64_t num_record_drop_obsolete = 0; + int64_t num_record_drop_range_del = 0; + int64_t num_range_del_drop_obsolete = 0; + // Deletions obsoleted before bottom level due to file gap optimization. + int64_t num_optimized_del_drop_obsolete = 0; + uint64_t total_filter_time = 0; + + // Input statistics + // TODO(noetzli): The stats are incomplete. They are lacking everything + // consumed by MergeHelper. + uint64_t num_input_records = 0; + uint64_t num_input_deletion_records = 0; + uint64_t num_input_corrupt_records = 0; + uint64_t total_input_raw_key_bytes = 0; + uint64_t total_input_raw_value_bytes = 0; + + // Single-Delete diagnostics for exceptional situations + uint64_t num_single_del_fallthru = 0; + uint64_t num_single_del_mismatch = 0; +}; diff --git a/src/rocksdb/db/compaction/compaction_iterator.cc b/src/rocksdb/db/compaction/compaction_iterator.cc new file mode 100644 index 000000000..1bebfc717 --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_iterator.cc @@ -0,0 +1,774 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include <cinttypes> + +#include "db/compaction/compaction_iterator.h" +#include "db/snapshot_checker.h" +#include "port/likely.h" +#include "rocksdb/listener.h" +#include "table/internal_iterator.h" +#include "test_util/sync_point.h" + +#define DEFINITELY_IN_SNAPSHOT(seq, snapshot) \ + ((seq) <= (snapshot) && \ + (snapshot_checker_ == nullptr || \ + LIKELY(snapshot_checker_->CheckInSnapshot((seq), (snapshot)) == \ + SnapshotCheckerResult::kInSnapshot))) + +#define DEFINITELY_NOT_IN_SNAPSHOT(seq, snapshot) \ + ((seq) > (snapshot) || \ + (snapshot_checker_ != nullptr && \ + UNLIKELY(snapshot_checker_->CheckInSnapshot((seq), (snapshot)) == \ + SnapshotCheckerResult::kNotInSnapshot))) + +#define IN_EARLIEST_SNAPSHOT(seq) \ + ((seq) <= earliest_snapshot_ && \ + (snapshot_checker_ == nullptr || LIKELY(IsInEarliestSnapshot(seq)))) + +namespace ROCKSDB_NAMESPACE { + +CompactionIterator::CompactionIterator( + InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper, + SequenceNumber last_sequence, std::vector<SequenceNumber>* snapshots, + SequenceNumber earliest_write_conflict_snapshot, + const SnapshotChecker* snapshot_checker, Env* env, + bool report_detailed_time, bool expect_valid_internal_key, + CompactionRangeDelAggregator* range_del_agg, const Compaction* compaction, + const CompactionFilter* compaction_filter, + const std::atomic<bool>* shutting_down, + const SequenceNumber preserve_deletes_seqnum, + const std::atomic<bool>* manual_compaction_paused, + const std::shared_ptr<Logger> info_log) + : CompactionIterator( + input, cmp, merge_helper, last_sequence, snapshots, + earliest_write_conflict_snapshot, snapshot_checker, env, + report_detailed_time, expect_valid_internal_key, range_del_agg, + std::unique_ptr<CompactionProxy>( + compaction ? new CompactionProxy(compaction) : nullptr), + compaction_filter, shutting_down, preserve_deletes_seqnum, + manual_compaction_paused, info_log) {} + +CompactionIterator::CompactionIterator( + InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper, + SequenceNumber /*last_sequence*/, std::vector<SequenceNumber>* snapshots, + SequenceNumber earliest_write_conflict_snapshot, + const SnapshotChecker* snapshot_checker, Env* env, + bool report_detailed_time, bool expect_valid_internal_key, + CompactionRangeDelAggregator* range_del_agg, + std::unique_ptr<CompactionProxy> compaction, + const CompactionFilter* compaction_filter, + const std::atomic<bool>* shutting_down, + const SequenceNumber preserve_deletes_seqnum, + const std::atomic<bool>* manual_compaction_paused, + const std::shared_ptr<Logger> info_log) + : input_(input), + cmp_(cmp), + merge_helper_(merge_helper), + snapshots_(snapshots), + earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot), + snapshot_checker_(snapshot_checker), + env_(env), + report_detailed_time_(report_detailed_time), + expect_valid_internal_key_(expect_valid_internal_key), + range_del_agg_(range_del_agg), + compaction_(std::move(compaction)), + compaction_filter_(compaction_filter), + shutting_down_(shutting_down), + manual_compaction_paused_(manual_compaction_paused), + preserve_deletes_seqnum_(preserve_deletes_seqnum), + current_user_key_sequence_(0), + current_user_key_snapshot_(0), + merge_out_iter_(merge_helper_), + current_key_committed_(false), + info_log_(info_log) { + assert(compaction_filter_ == nullptr || compaction_ != nullptr); + assert(snapshots_ != nullptr); + bottommost_level_ = + compaction_ == nullptr ? 
false : compaction_->bottommost_level(); + if (compaction_ != nullptr) { + level_ptrs_ = std::vector<size_t>(compaction_->number_levels(), 0); + } + if (snapshots_->size() == 0) { + // optimize for fast path if there are no snapshots + visible_at_tip_ = true; + earliest_snapshot_iter_ = snapshots_->end(); + earliest_snapshot_ = kMaxSequenceNumber; + latest_snapshot_ = 0; + } else { + visible_at_tip_ = false; + earliest_snapshot_iter_ = snapshots_->begin(); + earliest_snapshot_ = snapshots_->at(0); + latest_snapshot_ = snapshots_->back(); + } +#ifndef NDEBUG + // findEarliestVisibleSnapshot assumes this ordering. + for (size_t i = 1; i < snapshots_->size(); ++i) { + assert(snapshots_->at(i - 1) < snapshots_->at(i)); + } +#endif + input_->SetPinnedItersMgr(&pinned_iters_mgr_); + TEST_SYNC_POINT_CALLBACK("CompactionIterator:AfterInit", compaction_.get()); +} + +CompactionIterator::~CompactionIterator() { + // input_ Iteartor lifetime is longer than pinned_iters_mgr_ lifetime + input_->SetPinnedItersMgr(nullptr); +} + +void CompactionIterator::ResetRecordCounts() { + iter_stats_.num_record_drop_user = 0; + iter_stats_.num_record_drop_hidden = 0; + iter_stats_.num_record_drop_obsolete = 0; + iter_stats_.num_record_drop_range_del = 0; + iter_stats_.num_range_del_drop_obsolete = 0; + iter_stats_.num_optimized_del_drop_obsolete = 0; +} + +void CompactionIterator::SeekToFirst() { + NextFromInput(); + PrepareOutput(); +} + +void CompactionIterator::Next() { + // If there is a merge output, return it before continuing to process the + // input. + if (merge_out_iter_.Valid()) { + merge_out_iter_.Next(); + + // Check if we returned all records of the merge output. + if (merge_out_iter_.Valid()) { + key_ = merge_out_iter_.key(); + value_ = merge_out_iter_.value(); + bool valid_key __attribute__((__unused__)); + valid_key = ParseInternalKey(key_, &ikey_); + // MergeUntil stops when it encounters a corrupt key and does not + // include them in the result, so we expect the keys here to be valid. + assert(valid_key); + if (!valid_key) { + ROCKS_LOG_FATAL(info_log_, "Invalid key (%s) in compaction", + key_.ToString(true).c_str()); + } + + // Keep current_key_ in sync. + current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); + key_ = current_key_.GetInternalKey(); + ikey_.user_key = current_key_.GetUserKey(); + valid_ = true; + } else { + // We consumed all pinned merge operands, release pinned iterators + pinned_iters_mgr_.ReleasePinnedData(); + // MergeHelper moves the iterator to the first record after the merged + // records, so even though we reached the end of the merge output, we do + // not want to advance the iterator. + NextFromInput(); + } + } else { + // Only advance the input iterator if there is no merge output and the + // iterator is not already at the next record. + if (!at_next_) { + input_->Next(); + } + NextFromInput(); + } + + if (valid_) { + // Record that we've outputted a record for the current key. + has_outputted_key_ = true; + } + + PrepareOutput(); +} + +void CompactionIterator::InvokeFilterIfNeeded(bool* need_skip, + Slice* skip_until) { + if (compaction_filter_ != nullptr && + (ikey_.type == kTypeValue || ikey_.type == kTypeBlobIndex)) { + // If the user has specified a compaction filter and the sequence + // number is greater than any external snapshot, then invoke the + // filter. If the return value of the compaction filter is true, + // replace the entry with a deletion marker. 
+ CompactionFilter::Decision filter; + compaction_filter_value_.clear(); + compaction_filter_skip_until_.Clear(); + CompactionFilter::ValueType value_type = + ikey_.type == kTypeValue ? CompactionFilter::ValueType::kValue + : CompactionFilter::ValueType::kBlobIndex; + // Hack: pass internal key to BlobIndexCompactionFilter since it needs + // to get sequence number. + Slice& filter_key = ikey_.type == kTypeValue ? ikey_.user_key : key_; + { + StopWatchNano timer(env_, report_detailed_time_); + filter = compaction_filter_->FilterV2( + compaction_->level(), filter_key, value_type, value_, + &compaction_filter_value_, compaction_filter_skip_until_.rep()); + iter_stats_.total_filter_time += + env_ != nullptr && report_detailed_time_ ? timer.ElapsedNanos() : 0; + } + + if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil && + cmp_->Compare(*compaction_filter_skip_until_.rep(), ikey_.user_key) <= + 0) { + // Can't skip to a key smaller than the current one. + // Keep the key as per FilterV2 documentation. + filter = CompactionFilter::Decision::kKeep; + } + + if (filter == CompactionFilter::Decision::kRemove) { + // convert the current key to a delete; key_ is pointing into + // current_key_ at this point, so updating current_key_ updates key() + ikey_.type = kTypeDeletion; + current_key_.UpdateInternalKey(ikey_.sequence, kTypeDeletion); + // no value associated with delete + value_.clear(); + iter_stats_.num_record_drop_user++; + } else if (filter == CompactionFilter::Decision::kChangeValue) { + value_ = compaction_filter_value_; + } else if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil) { + *need_skip = true; + compaction_filter_skip_until_.ConvertFromUserKey(kMaxSequenceNumber, + kValueTypeForSeek); + *skip_until = compaction_filter_skip_until_.Encode(); + } + } +} + +void CompactionIterator::NextFromInput() { + at_next_ = false; + valid_ = false; + + while (!valid_ && input_->Valid() && !IsPausingManualCompaction() && + !IsShuttingDown()) { + key_ = input_->key(); + value_ = input_->value(); + iter_stats_.num_input_records++; + + if (!ParseInternalKey(key_, &ikey_)) { + // If `expect_valid_internal_key_` is false, return the corrupted key + // and let the caller decide what to do with it. + // TODO(noetzli): We should have a more elegant solution for this. + if (expect_valid_internal_key_) { + assert(!"Corrupted internal key not expected."); + status_ = Status::Corruption("Corrupted internal key not expected."); + break; + } + key_ = current_key_.SetInternalKey(key_); + has_current_user_key_ = false; + current_user_key_sequence_ = kMaxSequenceNumber; + current_user_key_snapshot_ = 0; + iter_stats_.num_input_corrupt_records++; + valid_ = true; + break; + } + TEST_SYNC_POINT_CALLBACK("CompactionIterator:ProcessKV", &ikey_); + + // Update input statistics + if (ikey_.type == kTypeDeletion || ikey_.type == kTypeSingleDeletion) { + iter_stats_.num_input_deletion_records++; + } + iter_stats_.total_input_raw_key_bytes += key_.size(); + iter_stats_.total_input_raw_value_bytes += value_.size(); + + // If need_skip is true, we should seek the input iterator + // to internal key skip_until and continue from there. + bool need_skip = false; + // Points either into compaction_filter_skip_until_ or into + // merge_helper_->compaction_filter_skip_until_. + Slice skip_until; + + // Check whether the user key changed. After this if statement current_key_ + // is a copy of the current input key (maybe converted to a delete by the + // compaction filter). 
ikey_.user_key is pointing to the copy. + if (!has_current_user_key_ || + !cmp_->Equal(ikey_.user_key, current_user_key_)) { + // First occurrence of this user key + // Copy key for output + key_ = current_key_.SetInternalKey(key_, &ikey_); + current_user_key_ = ikey_.user_key; + has_current_user_key_ = true; + has_outputted_key_ = false; + current_user_key_sequence_ = kMaxSequenceNumber; + current_user_key_snapshot_ = 0; + current_key_committed_ = KeyCommitted(ikey_.sequence); + + // Apply the compaction filter to the first committed version of the user + // key. + if (current_key_committed_) { + InvokeFilterIfNeeded(&need_skip, &skip_until); + } + } else { + // Update the current key to reflect the new sequence number/type without + // copying the user key. + // TODO(rven): Compaction filter does not process keys in this path + // Need to have the compaction filter process multiple versions + // if we have versions on both sides of a snapshot + current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); + key_ = current_key_.GetInternalKey(); + ikey_.user_key = current_key_.GetUserKey(); + + // Note that newer version of a key is ordered before older versions. If a + // newer version of a key is committed, so as the older version. No need + // to query snapshot_checker_ in that case. + if (UNLIKELY(!current_key_committed_)) { + assert(snapshot_checker_ != nullptr); + current_key_committed_ = KeyCommitted(ikey_.sequence); + // Apply the compaction filter to the first committed version of the + // user key. + if (current_key_committed_) { + InvokeFilterIfNeeded(&need_skip, &skip_until); + } + } + } + + if (UNLIKELY(!current_key_committed_)) { + assert(snapshot_checker_ != nullptr); + valid_ = true; + break; + } + + // If there are no snapshots, then this kv affect visibility at tip. + // Otherwise, search though all existing snapshots to find the earliest + // snapshot that is affected by this kv. + SequenceNumber last_sequence __attribute__((__unused__)); + last_sequence = current_user_key_sequence_; + current_user_key_sequence_ = ikey_.sequence; + SequenceNumber last_snapshot = current_user_key_snapshot_; + SequenceNumber prev_snapshot = 0; // 0 means no previous snapshot + current_user_key_snapshot_ = + visible_at_tip_ + ? earliest_snapshot_ + : findEarliestVisibleSnapshot(ikey_.sequence, &prev_snapshot); + + if (need_skip) { + // This case is handled below. + } else if (clear_and_output_next_key_) { + // In the previous iteration we encountered a single delete that we could + // not compact out. We will keep this Put, but can drop it's data. + // (See Optimization 3, below.) + assert(ikey_.type == kTypeValue); + if (ikey_.type != kTypeValue) { + ROCKS_LOG_FATAL(info_log_, + "Unexpected key type %d for compaction output", + ikey_.type); + } + assert(current_user_key_snapshot_ == last_snapshot); + if (current_user_key_snapshot_ != last_snapshot) { + ROCKS_LOG_FATAL(info_log_, + "current_user_key_snapshot_ (%" PRIu64 + ") != last_snapshot (%" PRIu64 ")", + current_user_key_snapshot_, last_snapshot); + } + + value_.clear(); + valid_ = true; + clear_and_output_next_key_ = false; + } else if (ikey_.type == kTypeSingleDeletion) { + // We can compact out a SingleDelete if: + // 1) We encounter the corresponding PUT -OR- we know that this key + // doesn't appear past this output level + // =AND= + // 2) We've already returned a record in this snapshot -OR- + // there are no earlier earliest_write_conflict_snapshot. + // + // Rule 1 is needed for SingleDelete correctness. 
Rule 2 is needed to + // allow Transactions to do write-conflict checking (if we compacted away + // all keys, then we wouldn't know that a write happened in this + // snapshot). If there is no earlier snapshot, then we know that there + // are no active transactions that need to know about any writes. + // + // Optimization 3: + // If we encounter a SingleDelete followed by a PUT and Rule 2 is NOT + // true, then we must output a SingleDelete. In this case, we will decide + // to also output the PUT. While we are compacting less by outputting the + // PUT now, hopefully this will lead to better compaction in the future + // when Rule 2 is later true (Ie, We are hoping we can later compact out + // both the SingleDelete and the Put, while we couldn't if we only + // outputted the SingleDelete now). + // In this case, we can save space by removing the PUT's value as it will + // never be read. + // + // Deletes and Merges are not supported on the same key that has a + // SingleDelete as it is not possible to correctly do any partial + // compaction of such a combination of operations. The result of mixing + // those operations for a given key is documented as being undefined. So + // we can choose how to handle such a combinations of operations. We will + // try to compact out as much as we can in these cases. + // We will report counts on these anomalous cases. + + // The easiest way to process a SingleDelete during iteration is to peek + // ahead at the next key. + ParsedInternalKey next_ikey; + input_->Next(); + + // Check whether the next key exists, is not corrupt, and is the same key + // as the single delete. + if (input_->Valid() && ParseInternalKey(input_->key(), &next_ikey) && + cmp_->Equal(ikey_.user_key, next_ikey.user_key)) { + // Check whether the next key belongs to the same snapshot as the + // SingleDelete. + if (prev_snapshot == 0 || + DEFINITELY_NOT_IN_SNAPSHOT(next_ikey.sequence, prev_snapshot)) { + if (next_ikey.type == kTypeSingleDeletion) { + // We encountered two SingleDeletes in a row. This could be due to + // unexpected user input. + // Skip the first SingleDelete and let the next iteration decide how + // to handle the second SingleDelete + + // First SingleDelete has been skipped since we already called + // input_->Next(). + ++iter_stats_.num_record_drop_obsolete; + ++iter_stats_.num_single_del_mismatch; + } else if (has_outputted_key_ || + DEFINITELY_IN_SNAPSHOT( + ikey_.sequence, earliest_write_conflict_snapshot_)) { + // Found a matching value, we can drop the single delete and the + // value. It is safe to drop both records since we've already + // outputted a key in this snapshot, or there is no earlier + // snapshot (Rule 2 above). + + // Note: it doesn't matter whether the second key is a Put or if it + // is an unexpected Merge or Delete. We will compact it out + // either way. We will maintain counts of how many mismatches + // happened + if (next_ikey.type != kTypeValue && + next_ikey.type != kTypeBlobIndex) { + ++iter_stats_.num_single_del_mismatch; + } + + ++iter_stats_.num_record_drop_hidden; + ++iter_stats_.num_record_drop_obsolete; + // Already called input_->Next() once. Call it a second time to + // skip past the second key. + input_->Next(); + } else { + // Found a matching value, but we cannot drop both keys since + // there is an earlier snapshot and we need to leave behind a record + // to know that a write happened in this snapshot (Rule 2 above). + // Clear the value and output the SingleDelete. 
(The value will be + // outputted on the next iteration.) + + // Setting valid_ to true will output the current SingleDelete + valid_ = true; + + // Set up the Put to be outputted in the next iteration. + // (Optimization 3). + clear_and_output_next_key_ = true; + } + } else { + // We hit the next snapshot without hitting a put, so the iterator + // returns the single delete. + valid_ = true; + } + } else { + // We are at the end of the input, could not parse the next key, or hit + // a different key. The iterator returns the single delete if the key + // possibly exists beyond the current output level. We set + // has_current_user_key to false so that if the iterator is at the next + // key, we do not compare it again against the previous key at the next + // iteration. If the next key is corrupt, we return before the + // comparison, so the value of has_current_user_key does not matter. + has_current_user_key_ = false; + if (compaction_ != nullptr && IN_EARLIEST_SNAPSHOT(ikey_.sequence) && + compaction_->KeyNotExistsBeyondOutputLevel(ikey_.user_key, + &level_ptrs_)) { + // Key doesn't exist outside of this range. + // Can compact out this SingleDelete. + ++iter_stats_.num_record_drop_obsolete; + ++iter_stats_.num_single_del_fallthru; + if (!bottommost_level_) { + ++iter_stats_.num_optimized_del_drop_obsolete; + } + } else { + // Output SingleDelete + valid_ = true; + } + } + + if (valid_) { + at_next_ = true; + } + } else if (last_snapshot == current_user_key_snapshot_ || + (last_snapshot > 0 && + last_snapshot < current_user_key_snapshot_)) { + // If the earliest snapshot is which this key is visible in + // is the same as the visibility of a previous instance of the + // same key, then this kv is not visible in any snapshot. + // Hidden by an newer entry for same user key + // + // Note: Dropping this key will not affect TransactionDB write-conflict + // checking since there has already been a record returned for this key + // in this snapshot. + assert(last_sequence >= current_user_key_sequence_); + if (last_sequence < current_user_key_sequence_) { + ROCKS_LOG_FATAL(info_log_, + "last_sequence (%" PRIu64 + ") < current_user_key_sequence_ (%" PRIu64 ")", + last_sequence, current_user_key_sequence_); + } + + ++iter_stats_.num_record_drop_hidden; // (A) + input_->Next(); + } else if (compaction_ != nullptr && ikey_.type == kTypeDeletion && + IN_EARLIEST_SNAPSHOT(ikey_.sequence) && + ikeyNotNeededForIncrementalSnapshot() && + compaction_->KeyNotExistsBeyondOutputLevel(ikey_.user_key, + &level_ptrs_)) { + // TODO(noetzli): This is the only place where we use compaction_ + // (besides the constructor). We should probably get rid of this + // dependency and find a way to do similar filtering during flushes. + // + // For this user key: + // (1) there is no data in higher levels + // (2) data in lower levels will have larger sequence numbers + // (3) data in layers that are being compacted here and have + // smaller sequence numbers will be dropped in the next + // few iterations of this loop (by rule (A) above). + // Therefore this deletion marker is obsolete and can be dropped. + // + // Note: Dropping this Delete will not affect TransactionDB + // write-conflict checking since it is earlier than any snapshot. + // + // It seems that we can also drop deletion later than earliest snapshot + // given that: + // (1) The deletion is earlier than earliest_write_conflict_snapshot, and + // (2) No value exist earlier than the deletion. 
+ ++iter_stats_.num_record_drop_obsolete; + if (!bottommost_level_) { + ++iter_stats_.num_optimized_del_drop_obsolete; + } + input_->Next(); + } else if ((ikey_.type == kTypeDeletion) && bottommost_level_ && + ikeyNotNeededForIncrementalSnapshot()) { + // Handle the case where we have a delete key at the bottom most level + // We can skip outputting the key iff there are no subsequent puts for this + // key + ParsedInternalKey next_ikey; + input_->Next(); + // Skip over all versions of this key that happen to occur in the same snapshot + // range as the delete + while (input_->Valid() && ParseInternalKey(input_->key(), &next_ikey) && + cmp_->Equal(ikey_.user_key, next_ikey.user_key) && + (prev_snapshot == 0 || + DEFINITELY_NOT_IN_SNAPSHOT(next_ikey.sequence, prev_snapshot))) { + input_->Next(); + } + // If you find you still need to output a row with this key, we need to output the + // delete too + if (input_->Valid() && ParseInternalKey(input_->key(), &next_ikey) && + cmp_->Equal(ikey_.user_key, next_ikey.user_key)) { + valid_ = true; + at_next_ = true; + } + } else if (ikey_.type == kTypeMerge) { + if (!merge_helper_->HasOperator()) { + status_ = Status::InvalidArgument( + "merge_operator is not properly initialized."); + return; + } + + pinned_iters_mgr_.StartPinning(); + // We know the merge type entry is not hidden, otherwise we would + // have hit (A) + // We encapsulate the merge related state machine in a different + // object to minimize change to the existing flow. + Status s = merge_helper_->MergeUntil(input_, range_del_agg_, + prev_snapshot, bottommost_level_); + merge_out_iter_.SeekToFirst(); + + if (!s.ok() && !s.IsMergeInProgress()) { + status_ = s; + return; + } else if (merge_out_iter_.Valid()) { + // NOTE: key, value, and ikey_ refer to old entries. + // These will be correctly set below. + key_ = merge_out_iter_.key(); + value_ = merge_out_iter_.value(); + bool valid_key __attribute__((__unused__)); + valid_key = ParseInternalKey(key_, &ikey_); + // MergeUntil stops when it encounters a corrupt key and does not + // include them in the result, so we expect the keys here to valid. + assert(valid_key); + if (!valid_key) { + ROCKS_LOG_FATAL(info_log_, "Invalid key (%s) in compaction", + key_.ToString(true).c_str()); + } + // Keep current_key_ in sync. + current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); + key_ = current_key_.GetInternalKey(); + ikey_.user_key = current_key_.GetUserKey(); + valid_ = true; + } else { + // all merge operands were filtered out. reset the user key, since the + // batch consumed by the merge operator should not shadow any keys + // coming after the merges + has_current_user_key_ = false; + pinned_iters_mgr_.ReleasePinnedData(); + + if (merge_helper_->FilteredUntil(&skip_until)) { + need_skip = true; + } + } + } else { + // 1. new user key -OR- + // 2. 
different snapshot stripe + bool should_delete = range_del_agg_->ShouldDelete( + key_, RangeDelPositioningMode::kForwardTraversal); + if (should_delete) { + ++iter_stats_.num_record_drop_hidden; + ++iter_stats_.num_record_drop_range_del; + input_->Next(); + } else { + valid_ = true; + } + } + + if (need_skip) { + input_->Seek(skip_until); + } + } + + if (!valid_ && IsShuttingDown()) { + status_ = Status::ShutdownInProgress(); + } + + if (IsPausingManualCompaction()) { + status_ = Status::Incomplete(Status::SubCode::kManualCompactionPaused); + } +} + +void CompactionIterator::PrepareOutput() { + if (valid_) { + if (compaction_filter_ && ikey_.type == kTypeBlobIndex) { + const auto blob_decision = compaction_filter_->PrepareBlobOutput( + user_key(), value_, &compaction_filter_value_); + + if (blob_decision == CompactionFilter::BlobDecision::kCorruption) { + status_ = Status::Corruption( + "Corrupted blob reference encountered during GC"); + valid_ = false; + } else if (blob_decision == CompactionFilter::BlobDecision::kIOError) { + status_ = Status::IOError("Could not relocate blob during GC"); + valid_ = false; + } else if (blob_decision == + CompactionFilter::BlobDecision::kChangeValue) { + value_ = compaction_filter_value_; + } + } + + // Zeroing out the sequence number leads to better compression. + // If this is the bottommost level (no files in lower levels) + // and the earliest snapshot is larger than this seqno + // and the userkey differs from the last userkey in compaction + // then we can squash the seqno to zero. + // + // This is safe for TransactionDB write-conflict checking since transactions + // only care about sequence number larger than any active snapshots. + // + // Can we do the same for levels above bottom level as long as + // KeyNotExistsBeyondOutputLevel() return true? + if (valid_ && compaction_ != nullptr && + !compaction_->allow_ingest_behind() && + ikeyNotNeededForIncrementalSnapshot() && bottommost_level_ && + IN_EARLIEST_SNAPSHOT(ikey_.sequence) && ikey_.type != kTypeMerge) { + assert(ikey_.type != kTypeDeletion && ikey_.type != kTypeSingleDeletion); + if (ikey_.type == kTypeDeletion || ikey_.type == kTypeSingleDeletion) { + ROCKS_LOG_FATAL(info_log_, + "Unexpected key type %d for seq-zero optimization", + ikey_.type); + } + ikey_.sequence = 0; + current_key_.UpdateInternalKey(0, ikey_.type); + } + } +} + +inline SequenceNumber CompactionIterator::findEarliestVisibleSnapshot( + SequenceNumber in, SequenceNumber* prev_snapshot) { + assert(snapshots_->size()); + if (snapshots_->size() == 0) { + ROCKS_LOG_FATAL(info_log_, + "No snapshot left in findEarliestVisibleSnapshot"); + } + auto snapshots_iter = std::lower_bound( + snapshots_->begin(), snapshots_->end(), in); + if (snapshots_iter == snapshots_->begin()) { + *prev_snapshot = 0; + } else { + *prev_snapshot = *std::prev(snapshots_iter); + assert(*prev_snapshot < in); + if (*prev_snapshot >= in) { + ROCKS_LOG_FATAL(info_log_, + "*prev_snapshot >= in in findEarliestVisibleSnapshot"); + } + } + if (snapshot_checker_ == nullptr) { + return snapshots_iter != snapshots_->end() + ? *snapshots_iter : kMaxSequenceNumber; + } + bool has_released_snapshot = !released_snapshots_.empty(); + for (; snapshots_iter != snapshots_->end(); ++snapshots_iter) { + auto cur = *snapshots_iter; + assert(in <= cur); + if (in > cur) { + ROCKS_LOG_FATAL(info_log_, "in > cur in findEarliestVisibleSnapshot"); + } + // Skip if cur is in released_snapshots. 
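Stepping back to PrepareOutput() above: the guard in front of the seqnum rewrite packs several independent safety conditions into one if statement. Restated as a standalone predicate (hypothetical helper, not RocksDB API; the parameters simply mirror the checks in the real guard):

    // The sequence number of an output key may be rewritten to 0 (which
    // compresses better) only when no reader and no later compaction step can
    // tell the difference.
    bool MaySquashSequenceToZero(bool has_compaction,       // compaction_ != nullptr
                                 bool allow_ingest_behind,  // ingest-behind relies on real seqnos
                                 bool needed_for_incremental_snapshot,
                                 bool bottommost_level,     // no older data in lower levels
                                 bool in_earliest_snapshot, // every snapshot already sees this key
                                 bool is_merge) {           // merge operands keep their seqnos
      return has_compaction && !allow_ingest_behind &&
             !needed_for_incremental_snapshot && bottommost_level &&
             in_earliest_snapshot && !is_merge;
    }

The comment in PrepareOutput() asks whether the same rewrite could be applied above the bottommost level whenever KeyNotExistsBeyondOutputLevel() holds; the code as written does not attempt it, so the sketch keeps the bottommost-level requirement.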
+ if (has_released_snapshot && released_snapshots_.count(cur) > 0) { + continue; + } + auto res = snapshot_checker_->CheckInSnapshot(in, cur); + if (res == SnapshotCheckerResult::kInSnapshot) { + return cur; + } else if (res == SnapshotCheckerResult::kSnapshotReleased) { + released_snapshots_.insert(cur); + } + *prev_snapshot = cur; + } + return kMaxSequenceNumber; +} + +// used in 2 places - prevents deletion markers to be dropped if they may be +// needed and disables seqnum zero-out in PrepareOutput for recent keys. +inline bool CompactionIterator::ikeyNotNeededForIncrementalSnapshot() { + return (!compaction_->preserve_deletes()) || + (ikey_.sequence < preserve_deletes_seqnum_); +} + +bool CompactionIterator::IsInEarliestSnapshot(SequenceNumber sequence) { + assert(snapshot_checker_ != nullptr); + bool pre_condition = (earliest_snapshot_ == kMaxSequenceNumber || + (earliest_snapshot_iter_ != snapshots_->end() && + *earliest_snapshot_iter_ == earliest_snapshot_)); + assert(pre_condition); + if (!pre_condition) { + ROCKS_LOG_FATAL(info_log_, + "Pre-Condition is not hold in IsInEarliestSnapshot"); + } + auto in_snapshot = + snapshot_checker_->CheckInSnapshot(sequence, earliest_snapshot_); + while (UNLIKELY(in_snapshot == SnapshotCheckerResult::kSnapshotReleased)) { + // Avoid the the current earliest_snapshot_ being return as + // earliest visible snapshot for the next value. So if a value's sequence + // is zero-ed out by PrepareOutput(), the next value will be compact out. + released_snapshots_.insert(earliest_snapshot_); + earliest_snapshot_iter_++; + + if (earliest_snapshot_iter_ == snapshots_->end()) { + earliest_snapshot_ = kMaxSequenceNumber; + } else { + earliest_snapshot_ = *earliest_snapshot_iter_; + } + in_snapshot = + snapshot_checker_->CheckInSnapshot(sequence, earliest_snapshot_); + } + assert(in_snapshot != SnapshotCheckerResult::kSnapshotReleased); + if (in_snapshot == SnapshotCheckerResult::kSnapshotReleased) { + ROCKS_LOG_FATAL(info_log_, + "Unexpected released snapshot in IsInEarliestSnapshot"); + } + return in_snapshot == SnapshotCheckerResult::kInSnapshot; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/compaction/compaction_iterator.h b/src/rocksdb/db/compaction/compaction_iterator.h new file mode 100644 index 000000000..8be60eb9e --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_iterator.h @@ -0,0 +1,240 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#include <algorithm> +#include <deque> +#include <string> +#include <unordered_set> +#include <vector> + +#include "db/compaction/compaction.h" +#include "db/compaction/compaction_iteration_stats.h" +#include "db/merge_helper.h" +#include "db/pinned_iterators_manager.h" +#include "db/range_del_aggregator.h" +#include "db/snapshot_checker.h" +#include "options/cf_options.h" +#include "rocksdb/compaction_filter.h" + +namespace ROCKSDB_NAMESPACE { + +class CompactionIterator { + public: + // A wrapper around Compaction. Has a much smaller interface, only what + // CompactionIterator uses. Tests can override it. 
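The class that follows is a pure test seam: the iterator only ever asks a handful of questions about the compaction, so a test can substitute a fake answer for each of them (the FakeCompaction further down in this diff does exactly that). Reduced to its essentials, the pattern looks like this (illustrative types, not the RocksDB classes):

    // A narrow virtual interface over the few facts the consumer needs.
    struct CompactionView {
      virtual ~CompactionView() = default;
      virtual bool bottommost_level() const = 0;
      virtual bool allow_ingest_behind() const = 0;
    };

    // Production code forwards to the real object; tests simply pin the answers.
    struct FakeCompactionView : CompactionView {
      bool is_bottommost = false;
      bool ingest_behind = false;
      bool bottommost_level() const override { return is_bottommost; }
      bool allow_ingest_behind() const override { return ingest_behind; }
    };

Keeping the surface this small is what lets compaction_iterator_test.cc exercise every code path without constructing real Version or VersionStorageInfo state.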
+ class CompactionProxy { + public: + explicit CompactionProxy(const Compaction* compaction) + : compaction_(compaction) {} + + virtual ~CompactionProxy() = default; + virtual int level(size_t /*compaction_input_level*/ = 0) const { + return compaction_->level(); + } + virtual bool KeyNotExistsBeyondOutputLevel( + const Slice& user_key, std::vector<size_t>* level_ptrs) const { + return compaction_->KeyNotExistsBeyondOutputLevel(user_key, level_ptrs); + } + virtual bool bottommost_level() const { + return compaction_->bottommost_level(); + } + virtual int number_levels() const { return compaction_->number_levels(); } + virtual Slice GetLargestUserKey() const { + return compaction_->GetLargestUserKey(); + } + virtual bool allow_ingest_behind() const { + return compaction_->immutable_cf_options()->allow_ingest_behind; + } + virtual bool preserve_deletes() const { + return compaction_->immutable_cf_options()->preserve_deletes; + } + + protected: + CompactionProxy() = default; + + private: + const Compaction* compaction_; + }; + + CompactionIterator( + InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper, + SequenceNumber last_sequence, std::vector<SequenceNumber>* snapshots, + SequenceNumber earliest_write_conflict_snapshot, + const SnapshotChecker* snapshot_checker, Env* env, + bool report_detailed_time, bool expect_valid_internal_key, + CompactionRangeDelAggregator* range_del_agg, + const Compaction* compaction = nullptr, + const CompactionFilter* compaction_filter = nullptr, + const std::atomic<bool>* shutting_down = nullptr, + const SequenceNumber preserve_deletes_seqnum = 0, + const std::atomic<bool>* manual_compaction_paused = nullptr, + const std::shared_ptr<Logger> info_log = nullptr); + + // Constructor with custom CompactionProxy, used for tests. + CompactionIterator( + InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper, + SequenceNumber last_sequence, std::vector<SequenceNumber>* snapshots, + SequenceNumber earliest_write_conflict_snapshot, + const SnapshotChecker* snapshot_checker, Env* env, + bool report_detailed_time, bool expect_valid_internal_key, + CompactionRangeDelAggregator* range_del_agg, + std::unique_ptr<CompactionProxy> compaction, + const CompactionFilter* compaction_filter = nullptr, + const std::atomic<bool>* shutting_down = nullptr, + const SequenceNumber preserve_deletes_seqnum = 0, + const std::atomic<bool>* manual_compaction_paused = nullptr, + const std::shared_ptr<Logger> info_log = nullptr); + + ~CompactionIterator(); + + void ResetRecordCounts(); + + // Seek to the beginning of the compaction iterator output. + // + // REQUIRED: Call only once. + void SeekToFirst(); + + // Produces the next record in the compaction. + // + // REQUIRED: SeekToFirst() has been called. + void Next(); + + // Getters + const Slice& key() const { return key_; } + const Slice& value() const { return value_; } + const Status& status() const { return status_; } + const ParsedInternalKey& ikey() const { return ikey_; } + bool Valid() const { return valid_; } + const Slice& user_key() const { return current_user_key_; } + const CompactionIterationStats& iter_stats() const { return iter_stats_; } + + private: + // Processes the input stream to find the next output + void NextFromInput(); + + // Do last preparations before presenting the output to the callee. At this + // point this only zeroes out the sequence number if possible for better + // compression. + void PrepareOutput(); + + // Invoke compaction filter if needed. 
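The public surface declared above is consumed as a simple pull loop: SeekToFirst() once, then key()/value() while Valid(), then Next(). A sketch of a caller, assuming this header (the real driver inside CompactionJob additionally routes each record into a TableBuilder and decides when to cut output files):

    #include <cstdint>

    #include "db/compaction/compaction_iterator.h"

    namespace ROCKSDB_NAMESPACE {
    // Hypothetical helper, not part of this patch: drain an already-constructed
    // iterator and count the records it emits.
    uint64_t CountCompactionOutput(CompactionIterator* c_iter) {
      uint64_t n = 0;
      for (c_iter->SeekToFirst(); c_iter->Valid(); c_iter->Next()) {
        // c_iter->key() / c_iter->value() are only valid until the next Next().
        ++n;
      }
      // !Valid() can mean "input exhausted" or "error / shutdown in progress";
      // the caller must still inspect c_iter->status() to tell them apart.
      return n;
    }
    }  // namespace ROCKSDB_NAMESPACE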
+ void InvokeFilterIfNeeded(bool* need_skip, Slice* skip_until); + + // Given a sequence number, return the sequence number of the + // earliest snapshot that this sequence number is visible in. + // The snapshots themselves are arranged in ascending order of + // sequence numbers. + // Employ a sequential search because the total number of + // snapshots are typically small. + inline SequenceNumber findEarliestVisibleSnapshot( + SequenceNumber in, SequenceNumber* prev_snapshot); + + // Checks whether the currently seen ikey_ is needed for + // incremental (differential) snapshot and hence can't be dropped + // or seqnum be zero-ed out even if all other conditions for it are met. + inline bool ikeyNotNeededForIncrementalSnapshot(); + + inline bool KeyCommitted(SequenceNumber sequence) { + return snapshot_checker_ == nullptr || + snapshot_checker_->CheckInSnapshot(sequence, kMaxSequenceNumber) == + SnapshotCheckerResult::kInSnapshot; + } + + bool IsInEarliestSnapshot(SequenceNumber sequence); + + InternalIterator* input_; + const Comparator* cmp_; + MergeHelper* merge_helper_; + const std::vector<SequenceNumber>* snapshots_; + // List of snapshots released during compaction. + // findEarliestVisibleSnapshot() find them out from return of + // snapshot_checker, and make sure they will not be returned as + // earliest visible snapshot of an older value. + // See WritePreparedTransactionTest::ReleaseSnapshotDuringCompaction3. + std::unordered_set<SequenceNumber> released_snapshots_; + std::vector<SequenceNumber>::const_iterator earliest_snapshot_iter_; + const SequenceNumber earliest_write_conflict_snapshot_; + const SnapshotChecker* const snapshot_checker_; + Env* env_; + bool report_detailed_time_; + bool expect_valid_internal_key_; + CompactionRangeDelAggregator* range_del_agg_; + std::unique_ptr<CompactionProxy> compaction_; + const CompactionFilter* compaction_filter_; + const std::atomic<bool>* shutting_down_; + const std::atomic<bool>* manual_compaction_paused_; + const SequenceNumber preserve_deletes_seqnum_; + bool bottommost_level_; + bool valid_ = false; + bool visible_at_tip_; + SequenceNumber earliest_snapshot_; + SequenceNumber latest_snapshot_; + + // State + // + // Points to a copy of the current compaction iterator output (current_key_) + // if valid_. + Slice key_; + // Points to the value in the underlying iterator that corresponds to the + // current output. + Slice value_; + // The status is OK unless compaction iterator encounters a merge operand + // while not having a merge operator defined. + Status status_; + // Stores the user key, sequence number and type of the current compaction + // iterator output (or current key in the underlying iterator during + // NextFromInput()). + ParsedInternalKey ikey_; + // Stores whether ikey_.user_key is valid. If set to false, the user key is + // not compared against the current key in the underlying iterator. + bool has_current_user_key_ = false; + bool at_next_ = false; // If false, the iterator + // Holds a copy of the current compaction iterator output (or current key in + // the underlying iterator during NextFromInput()). + IterKey current_key_; + Slice current_user_key_; + SequenceNumber current_user_key_sequence_; + SequenceNumber current_user_key_snapshot_; + + // True if the iterator has already returned a record for the current key. + bool has_outputted_key_ = false; + + // truncated the value of the next key and output it without applying any + // compaction rules. 
This is used for outputting a put after a single delete. + bool clear_and_output_next_key_ = false; + + MergeOutputIterator merge_out_iter_; + // PinnedIteratorsManager used to pin input_ Iterator blocks while reading + // merge operands and then releasing them after consuming them. + PinnedIteratorsManager pinned_iters_mgr_; + std::string compaction_filter_value_; + InternalKey compaction_filter_skip_until_; + // "level_ptrs" holds indices that remember which file of an associated + // level we were last checking during the last call to compaction-> + // KeyNotExistsBeyondOutputLevel(). This allows future calls to the function + // to pick off where it left off since each subcompaction's key range is + // increasing so a later call to the function must be looking for a key that + // is in or beyond the last file checked during the previous call + std::vector<size_t> level_ptrs_; + CompactionIterationStats iter_stats_; + + // Used to avoid purging uncommitted values. The application can specify + // uncommitted values by providing a SnapshotChecker object. + bool current_key_committed_; + std::shared_ptr<Logger> info_log_; + + bool IsShuttingDown() { + // This is a best-effort facility, so memory_order_relaxed is sufficient. + return shutting_down_ && shutting_down_->load(std::memory_order_relaxed); + } + + bool IsPausingManualCompaction() { + // This is a best-effort facility, so memory_order_relaxed is sufficient. + return manual_compaction_paused_ && + manual_compaction_paused_->load(std::memory_order_relaxed); + } +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/compaction/compaction_iterator_test.cc b/src/rocksdb/db/compaction/compaction_iterator_test.cc new file mode 100644 index 000000000..0c50fb9ba --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_iterator_test.cc @@ -0,0 +1,976 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + + +#include <string> +#include <vector> + +#include "db/compaction/compaction_iterator.h" +#include "port/port.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/string_util.h" +#include "utilities/merge_operators.h" + +namespace ROCKSDB_NAMESPACE { + +// Expects no merging attempts. +class NoMergingMergeOp : public MergeOperator { + public: + bool FullMergeV2(const MergeOperationInput& /*merge_in*/, + MergeOperationOutput* /*merge_out*/) const override { + ADD_FAILURE(); + return false; + } + bool PartialMergeMulti(const Slice& /*key*/, + const std::deque<Slice>& /*operand_list*/, + std::string* /*new_value*/, + Logger* /*logger*/) const override { + ADD_FAILURE(); + return false; + } + const char* Name() const override { + return "CompactionIteratorTest NoMergingMergeOp"; + } +}; + +// Compaction filter that gets stuck when it sees a particular key, +// then gets unstuck when told to. +// Always returns Decition::kRemove. 
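The filter that follows deliberately blocks the compaction thread so the shutdown tests further down can interrupt it at a known key. The synchronization it uses is a small two-atomics handshake: the worker publishes the last key it saw and spins until the test raises a threshold. The same handshake, extracted as a self-contained sketch (illustrative, independent of the CompactionFilter interface):

    #include <atomic>
    #include <thread>

    struct StallGate {
      std::atomic<int> stall_at{0};   // worker may pass keys strictly below this
      std::atomic<int> last_seen{0};  // last key the worker reported

      // Worker thread: report progress, then wait until allowed past `k`.
      void Pass(int k) {
        last_seen.store(k);
        while (k >= stall_at.load()) {
          std::this_thread::yield();  // busy-wait is fine for a test fixture
        }
      }

      // Test thread: let the worker advance up to `k` and wait until it is
      // actually stalled there.
      void WaitForStallAt(int k) {
        stall_at.store(k);
        while (last_seen.load() < k) {
          std::this_thread::yield();
        }
      }
    };

The shutdown tests below use exactly this shape through StallingFilter::WaitForStall(), and after joining the compaction thread they check last_seen once more to prove the filter was never called again.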
+class StallingFilter : public CompactionFilter { + public: + Decision FilterV2(int /*level*/, const Slice& key, ValueType /*type*/, + const Slice& /*existing_value*/, std::string* /*new_value*/, + std::string* /*skip_until*/) const override { + int k = std::atoi(key.ToString().c_str()); + last_seen.store(k); + while (k >= stall_at.load()) { + std::this_thread::yield(); + } + return Decision::kRemove; + } + + const char* Name() const override { + return "CompactionIteratorTest StallingFilter"; + } + + // Wait until the filter sees a key >= k and stalls at that key. + // If `exact`, asserts that the seen key is equal to k. + void WaitForStall(int k, bool exact = true) { + stall_at.store(k); + while (last_seen.load() < k) { + std::this_thread::yield(); + } + if (exact) { + EXPECT_EQ(k, last_seen.load()); + } + } + + // Filter will stall on key >= stall_at. Advance stall_at to unstall. + mutable std::atomic<int> stall_at{0}; + // Last key the filter was called with. + mutable std::atomic<int> last_seen{0}; +}; + +// Compaction filter that filter out all keys. +class FilterAllKeysCompactionFilter : public CompactionFilter { + public: + Decision FilterV2(int /*level*/, const Slice& /*key*/, ValueType /*type*/, + const Slice& /*existing_value*/, std::string* /*new_value*/, + std::string* /*skip_until*/) const override { + return Decision::kRemove; + } + + const char* Name() const override { return "AllKeysCompactionFilter"; } +}; + +class LoggingForwardVectorIterator : public InternalIterator { + public: + struct Action { + enum class Type { + SEEK_TO_FIRST, + SEEK, + NEXT, + }; + + Type type; + std::string arg; + + explicit Action(Type _type, std::string _arg = "") + : type(_type), arg(_arg) {} + + bool operator==(const Action& rhs) const { + return std::tie(type, arg) == std::tie(rhs.type, rhs.arg); + } + }; + + LoggingForwardVectorIterator(const std::vector<std::string>& keys, + const std::vector<std::string>& values) + : keys_(keys), values_(values), current_(keys.size()) { + assert(keys_.size() == values_.size()); + } + + bool Valid() const override { return current_ < keys_.size(); } + + void SeekToFirst() override { + log.emplace_back(Action::Type::SEEK_TO_FIRST); + current_ = 0; + } + void SeekToLast() override { assert(false); } + + void Seek(const Slice& target) override { + log.emplace_back(Action::Type::SEEK, target.ToString()); + current_ = std::lower_bound(keys_.begin(), keys_.end(), target.ToString()) - + keys_.begin(); + } + + void SeekForPrev(const Slice& /*target*/) override { assert(false); } + + void Next() override { + assert(Valid()); + log.emplace_back(Action::Type::NEXT); + current_++; + } + void Prev() override { assert(false); } + + Slice key() const override { + assert(Valid()); + return Slice(keys_[current_]); + } + Slice value() const override { + assert(Valid()); + return Slice(values_[current_]); + } + + Status status() const override { return Status::OK(); } + + std::vector<Action> log; + + private: + std::vector<std::string> keys_; + std::vector<std::string> values_; + size_t current_; +}; + +class FakeCompaction : public CompactionIterator::CompactionProxy { + public: + FakeCompaction() = default; + + int level(size_t /*compaction_input_level*/) const override { return 0; } + bool KeyNotExistsBeyondOutputLevel( + const Slice& /*user_key*/, + std::vector<size_t>* /*level_ptrs*/) const override { + return is_bottommost_level || key_not_exists_beyond_output_level; + } + bool bottommost_level() const override { return is_bottommost_level; } + int number_levels() 
const override { return 1; } + Slice GetLargestUserKey() const override { + return "\xff\xff\xff\xff\xff\xff\xff\xff\xff"; + } + bool allow_ingest_behind() const override { return false; } + + bool preserve_deletes() const override { return false; } + + bool key_not_exists_beyond_output_level = false; + + bool is_bottommost_level = false; +}; + +// A simplifed snapshot checker which assumes each snapshot has a global +// last visible sequence. +class TestSnapshotChecker : public SnapshotChecker { + public: + explicit TestSnapshotChecker( + SequenceNumber last_committed_sequence, + const std::unordered_map<SequenceNumber, SequenceNumber>& snapshots = {{}}) + : last_committed_sequence_(last_committed_sequence), + snapshots_(snapshots) {} + + SnapshotCheckerResult CheckInSnapshot( + SequenceNumber seq, SequenceNumber snapshot_seq) const override { + if (snapshot_seq == kMaxSequenceNumber) { + return seq <= last_committed_sequence_ + ? SnapshotCheckerResult::kInSnapshot + : SnapshotCheckerResult::kNotInSnapshot; + } + assert(snapshots_.count(snapshot_seq) > 0); + return seq <= snapshots_.at(snapshot_seq) + ? SnapshotCheckerResult::kInSnapshot + : SnapshotCheckerResult::kNotInSnapshot; + } + + private: + SequenceNumber last_committed_sequence_; + // A map of valid snapshot to last visible sequence to the snapshot. + std::unordered_map<SequenceNumber, SequenceNumber> snapshots_; +}; + +// Test param: +// bool: whether to pass snapshot_checker to compaction iterator. +class CompactionIteratorTest : public testing::TestWithParam<bool> { + public: + CompactionIteratorTest() + : cmp_(BytewiseComparator()), icmp_(cmp_), snapshots_({}) {} + + void InitIterators( + const std::vector<std::string>& ks, const std::vector<std::string>& vs, + const std::vector<std::string>& range_del_ks, + const std::vector<std::string>& range_del_vs, + SequenceNumber last_sequence, + SequenceNumber last_committed_sequence = kMaxSequenceNumber, + MergeOperator* merge_op = nullptr, CompactionFilter* filter = nullptr, + bool bottommost_level = false, + SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber) { + std::unique_ptr<InternalIterator> unfragmented_range_del_iter( + new test::VectorIterator(range_del_ks, range_del_vs)); + auto tombstone_list = std::make_shared<FragmentedRangeTombstoneList>( + std::move(unfragmented_range_del_iter), icmp_); + std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter( + new FragmentedRangeTombstoneIterator(tombstone_list, icmp_, + kMaxSequenceNumber)); + range_del_agg_.reset(new CompactionRangeDelAggregator(&icmp_, snapshots_)); + range_del_agg_->AddTombstones(std::move(range_del_iter)); + + std::unique_ptr<CompactionIterator::CompactionProxy> compaction; + if (filter || bottommost_level) { + compaction_proxy_ = new FakeCompaction(); + compaction_proxy_->is_bottommost_level = bottommost_level; + compaction.reset(compaction_proxy_); + } + bool use_snapshot_checker = UseSnapshotChecker() || GetParam(); + if (use_snapshot_checker || last_committed_sequence < kMaxSequenceNumber) { + snapshot_checker_.reset( + new TestSnapshotChecker(last_committed_sequence, snapshot_map_)); + } + merge_helper_.reset( + new MergeHelper(Env::Default(), cmp_, merge_op, filter, nullptr, false, + 0 /*latest_snapshot*/, snapshot_checker_.get(), + 0 /*level*/, nullptr /*statistics*/, &shutting_down_)); + + iter_.reset(new LoggingForwardVectorIterator(ks, vs)); + iter_->SeekToFirst(); + c_iter_.reset(new CompactionIterator( + iter_.get(), cmp_, merge_helper_.get(), last_sequence, &snapshots_, 
+ earliest_write_conflict_snapshot, snapshot_checker_.get(), + Env::Default(), false /* report_detailed_time */, false, + range_del_agg_.get(), std::move(compaction), filter, &shutting_down_)); + } + + void AddSnapshot(SequenceNumber snapshot, + SequenceNumber last_visible_seq = kMaxSequenceNumber) { + snapshots_.push_back(snapshot); + snapshot_map_[snapshot] = last_visible_seq; + } + + virtual bool UseSnapshotChecker() const { return false; } + + void RunTest( + const std::vector<std::string>& input_keys, + const std::vector<std::string>& input_values, + const std::vector<std::string>& expected_keys, + const std::vector<std::string>& expected_values, + SequenceNumber last_committed_seq = kMaxSequenceNumber, + MergeOperator* merge_operator = nullptr, + CompactionFilter* compaction_filter = nullptr, + bool bottommost_level = false, + SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber) { + InitIterators(input_keys, input_values, {}, {}, kMaxSequenceNumber, + last_committed_seq, merge_operator, compaction_filter, + bottommost_level, earliest_write_conflict_snapshot); + c_iter_->SeekToFirst(); + for (size_t i = 0; i < expected_keys.size(); i++) { + std::string info = "i = " + ToString(i); + ASSERT_TRUE(c_iter_->Valid()) << info; + ASSERT_OK(c_iter_->status()) << info; + ASSERT_EQ(expected_keys[i], c_iter_->key().ToString()) << info; + ASSERT_EQ(expected_values[i], c_iter_->value().ToString()) << info; + c_iter_->Next(); + } + ASSERT_FALSE(c_iter_->Valid()); + } + + const Comparator* cmp_; + const InternalKeyComparator icmp_; + std::vector<SequenceNumber> snapshots_; + // A map of valid snapshot to last visible sequence to the snapshot. + std::unordered_map<SequenceNumber, SequenceNumber> snapshot_map_; + std::unique_ptr<MergeHelper> merge_helper_; + std::unique_ptr<LoggingForwardVectorIterator> iter_; + std::unique_ptr<CompactionIterator> c_iter_; + std::unique_ptr<CompactionRangeDelAggregator> range_del_agg_; + std::unique_ptr<SnapshotChecker> snapshot_checker_; + std::atomic<bool> shutting_down_{false}; + FakeCompaction* compaction_proxy_; +}; + +// It is possible that the output of the compaction iterator is empty even if +// the input is not. +TEST_P(CompactionIteratorTest, EmptyResult) { + InitIterators({test::KeyStr("a", 5, kTypeSingleDeletion), + test::KeyStr("a", 3, kTypeValue)}, + {"", "val"}, {}, {}, 5); + c_iter_->SeekToFirst(); + ASSERT_FALSE(c_iter_->Valid()); +} + +// If there is a corruption after a single deletion, the corrupted key should +// be preserved. 
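RunTest() above is the workhorse for most of the remaining cases: supply the input internal keys and values, the expected output, and optionally a merge operator, compaction filter, bottommost flag and write-conflict snapshot. As a usage sketch, the EmptyResult case shown earlier could equally be phrased through the helper (hypothetical test, not part of the patch; a SingleDelete and the Put it covers annihilate when no snapshot separates them):

    TEST_P(CompactionIteratorTest, EmptyResultViaRunTest) {
      RunTest({test::KeyStr("a", 5, kTypeSingleDeletion),
               test::KeyStr("a", 3, kTypeValue)},
              {"", "val"},
              {} /*expected_keys*/, {} /*expected_values*/);
    }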
+TEST_P(CompactionIteratorTest, CorruptionAfterSingleDeletion) { + InitIterators({test::KeyStr("a", 5, kTypeSingleDeletion), + test::KeyStr("a", 3, kTypeValue, true), + test::KeyStr("b", 10, kTypeValue)}, + {"", "val", "val2"}, {}, {}, 10); + c_iter_->SeekToFirst(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ(test::KeyStr("a", 5, kTypeSingleDeletion), + c_iter_->key().ToString()); + c_iter_->Next(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ(test::KeyStr("a", 3, kTypeValue, true), c_iter_->key().ToString()); + c_iter_->Next(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ(test::KeyStr("b", 10, kTypeValue), c_iter_->key().ToString()); + c_iter_->Next(); + ASSERT_FALSE(c_iter_->Valid()); +} + +TEST_P(CompactionIteratorTest, SimpleRangeDeletion) { + InitIterators({test::KeyStr("morning", 5, kTypeValue), + test::KeyStr("morning", 2, kTypeValue), + test::KeyStr("night", 3, kTypeValue)}, + {"zao", "zao", "wan"}, + {test::KeyStr("ma", 4, kTypeRangeDeletion)}, {"mz"}, 5); + c_iter_->SeekToFirst(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ(test::KeyStr("morning", 5, kTypeValue), c_iter_->key().ToString()); + c_iter_->Next(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ(test::KeyStr("night", 3, kTypeValue), c_iter_->key().ToString()); + c_iter_->Next(); + ASSERT_FALSE(c_iter_->Valid()); +} + +TEST_P(CompactionIteratorTest, RangeDeletionWithSnapshots) { + AddSnapshot(10); + std::vector<std::string> ks1; + ks1.push_back(test::KeyStr("ma", 28, kTypeRangeDeletion)); + std::vector<std::string> vs1{"mz"}; + std::vector<std::string> ks2{test::KeyStr("morning", 15, kTypeValue), + test::KeyStr("morning", 5, kTypeValue), + test::KeyStr("night", 40, kTypeValue), + test::KeyStr("night", 20, kTypeValue)}; + std::vector<std::string> vs2{"zao 15", "zao 5", "wan 40", "wan 20"}; + InitIterators(ks2, vs2, ks1, vs1, 40); + c_iter_->SeekToFirst(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ(test::KeyStr("morning", 5, kTypeValue), c_iter_->key().ToString()); + c_iter_->Next(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ(test::KeyStr("night", 40, kTypeValue), c_iter_->key().ToString()); + c_iter_->Next(); + ASSERT_FALSE(c_iter_->Valid()); +} + +TEST_P(CompactionIteratorTest, CompactionFilterSkipUntil) { + class Filter : public CompactionFilter { + Decision FilterV2(int /*level*/, const Slice& key, ValueType t, + const Slice& existing_value, std::string* /*new_value*/, + std::string* skip_until) const override { + std::string k = key.ToString(); + std::string v = existing_value.ToString(); + // See InitIterators() call below for the sequence of keys and their + // filtering decisions. Here we closely assert that compaction filter is + // called with the expected keys and only them, and with the right values. 
+ if (k == "a") { + EXPECT_EQ(ValueType::kValue, t); + EXPECT_EQ("av50", v); + return Decision::kKeep; + } + if (k == "b") { + EXPECT_EQ(ValueType::kValue, t); + EXPECT_EQ("bv60", v); + *skip_until = "d+"; + return Decision::kRemoveAndSkipUntil; + } + if (k == "e") { + EXPECT_EQ(ValueType::kMergeOperand, t); + EXPECT_EQ("em71", v); + return Decision::kKeep; + } + if (k == "f") { + if (v == "fm65") { + EXPECT_EQ(ValueType::kMergeOperand, t); + *skip_until = "f"; + } else { + EXPECT_EQ("fm30", v); + EXPECT_EQ(ValueType::kMergeOperand, t); + *skip_until = "g+"; + } + return Decision::kRemoveAndSkipUntil; + } + if (k == "h") { + EXPECT_EQ(ValueType::kValue, t); + EXPECT_EQ("hv91", v); + return Decision::kKeep; + } + if (k == "i") { + EXPECT_EQ(ValueType::kMergeOperand, t); + EXPECT_EQ("im95", v); + *skip_until = "z"; + return Decision::kRemoveAndSkipUntil; + } + ADD_FAILURE(); + return Decision::kKeep; + } + + const char* Name() const override { + return "CompactionIteratorTest.CompactionFilterSkipUntil::Filter"; + } + }; + + NoMergingMergeOp merge_op; + Filter filter; + InitIterators( + {test::KeyStr("a", 50, kTypeValue), // keep + test::KeyStr("a", 45, kTypeMerge), + test::KeyStr("b", 60, kTypeValue), // skip to "d+" + test::KeyStr("b", 40, kTypeValue), test::KeyStr("c", 35, kTypeValue), + test::KeyStr("d", 70, kTypeMerge), + test::KeyStr("e", 71, kTypeMerge), // keep + test::KeyStr("f", 65, kTypeMerge), // skip to "f", aka keep + test::KeyStr("f", 30, kTypeMerge), // skip to "g+" + test::KeyStr("f", 25, kTypeValue), test::KeyStr("g", 90, kTypeValue), + test::KeyStr("h", 91, kTypeValue), // keep + test::KeyStr("i", 95, kTypeMerge), // skip to "z" + test::KeyStr("j", 99, kTypeValue)}, + {"av50", "am45", "bv60", "bv40", "cv35", "dm70", "em71", "fm65", "fm30", + "fv25", "gv90", "hv91", "im95", "jv99"}, + {}, {}, kMaxSequenceNumber, kMaxSequenceNumber, &merge_op, &filter); + + // Compaction should output just "a", "e" and "h" keys. + c_iter_->SeekToFirst(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ(test::KeyStr("a", 50, kTypeValue), c_iter_->key().ToString()); + ASSERT_EQ("av50", c_iter_->value().ToString()); + c_iter_->Next(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ(test::KeyStr("e", 71, kTypeMerge), c_iter_->key().ToString()); + ASSERT_EQ("em71", c_iter_->value().ToString()); + c_iter_->Next(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ(test::KeyStr("h", 91, kTypeValue), c_iter_->key().ToString()); + ASSERT_EQ("hv91", c_iter_->value().ToString()); + c_iter_->Next(); + ASSERT_FALSE(c_iter_->Valid()); + + // Check that the compaction iterator did the correct sequence of calls on + // the underlying iterator. 
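The expected_actions check that follows is the interesting half of this test: kRemoveAndSkipUntil does not drop keys one at a time, it makes the compaction Seek() the underlying iterator straight to skip_until, which is what keeps the filter cheap over large dead ranges. A minimal filter using that decision might look like this (sketch against the FilterV2 interface shown earlier; the key range and class name are made up):

    // Drops every key in ["b", "d") with a single decision; the compaction then
    // seeks directly to "d" instead of invoking the filter for each dropped key.
    class SkipDeadRangeFilter : public CompactionFilter {
     public:
      Decision FilterV2(int /*level*/, const Slice& key, ValueType /*type*/,
                        const Slice& /*existing_value*/,
                        std::string* /*new_value*/,
                        std::string* skip_until) const override {
        if (key.compare("b") >= 0 && key.compare("d") < 0) {
          *skip_until = "d";  // skip to the first key past the dead range
          return Decision::kRemoveAndSkipUntil;
        }
        return Decision::kKeep;
      }
      const char* Name() const override { return "SkipDeadRangeFilter"; }
    };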
+ using A = LoggingForwardVectorIterator::Action; + using T = A::Type; + std::vector<A> expected_actions = { + A(T::SEEK_TO_FIRST), + A(T::NEXT), + A(T::NEXT), + A(T::SEEK, test::KeyStr("d+", kMaxSequenceNumber, kValueTypeForSeek)), + A(T::NEXT), + A(T::NEXT), + A(T::SEEK, test::KeyStr("g+", kMaxSequenceNumber, kValueTypeForSeek)), + A(T::NEXT), + A(T::SEEK, test::KeyStr("z", kMaxSequenceNumber, kValueTypeForSeek))}; + ASSERT_EQ(expected_actions, iter_->log); +} + +TEST_P(CompactionIteratorTest, ShuttingDownInFilter) { + NoMergingMergeOp merge_op; + StallingFilter filter; + InitIterators( + {test::KeyStr("1", 1, kTypeValue), test::KeyStr("2", 2, kTypeValue), + test::KeyStr("3", 3, kTypeValue), test::KeyStr("4", 4, kTypeValue)}, + {"v1", "v2", "v3", "v4"}, {}, {}, kMaxSequenceNumber, kMaxSequenceNumber, + &merge_op, &filter); + // Don't leave tombstones (kTypeDeletion) for filtered keys. + compaction_proxy_->key_not_exists_beyond_output_level = true; + + std::atomic<bool> seek_done{false}; + ROCKSDB_NAMESPACE::port::Thread compaction_thread([&] { + c_iter_->SeekToFirst(); + EXPECT_FALSE(c_iter_->Valid()); + EXPECT_TRUE(c_iter_->status().IsShutdownInProgress()); + seek_done.store(true); + }); + + // Let key 1 through. + filter.WaitForStall(1); + + // Shutdown during compaction filter call for key 2. + filter.WaitForStall(2); + shutting_down_.store(true); + EXPECT_FALSE(seek_done.load()); + + // Unstall filter and wait for SeekToFirst() to return. + filter.stall_at.store(3); + compaction_thread.join(); + assert(seek_done.load()); + + // Check that filter was never called again. + EXPECT_EQ(2, filter.last_seen.load()); +} + +// Same as ShuttingDownInFilter, but shutdown happens during filter call for +// a merge operand, not for a value. +TEST_P(CompactionIteratorTest, ShuttingDownInMerge) { + NoMergingMergeOp merge_op; + StallingFilter filter; + InitIterators( + {test::KeyStr("1", 1, kTypeValue), test::KeyStr("2", 2, kTypeMerge), + test::KeyStr("3", 3, kTypeMerge), test::KeyStr("4", 4, kTypeValue)}, + {"v1", "v2", "v3", "v4"}, {}, {}, kMaxSequenceNumber, kMaxSequenceNumber, + &merge_op, &filter); + compaction_proxy_->key_not_exists_beyond_output_level = true; + + std::atomic<bool> seek_done{false}; + ROCKSDB_NAMESPACE::port::Thread compaction_thread([&] { + c_iter_->SeekToFirst(); + ASSERT_FALSE(c_iter_->Valid()); + ASSERT_TRUE(c_iter_->status().IsShutdownInProgress()); + seek_done.store(true); + }); + + // Let key 1 through. + filter.WaitForStall(1); + + // Shutdown during compaction filter call for key 2. + filter.WaitForStall(2); + shutting_down_.store(true); + EXPECT_FALSE(seek_done.load()); + + // Unstall filter and wait for SeekToFirst() to return. + filter.stall_at.store(3); + compaction_thread.join(); + assert(seek_done.load()); + + // Check that filter was never called again. + EXPECT_EQ(2, filter.last_seen.load()); +} + +TEST_P(CompactionIteratorTest, SingleMergeOperand) { + class Filter : public CompactionFilter { + Decision FilterV2(int /*level*/, const Slice& key, ValueType t, + const Slice& existing_value, std::string* /*new_value*/, + std::string* /*skip_until*/) const override { + std::string k = key.ToString(); + std::string v = existing_value.ToString(); + + // See InitIterators() call below for the sequence of keys and their + // filtering decisions. Here we closely assert that compaction filter is + // called with the expected keys and only them, and with the right values. 
+ if (k == "a") { + EXPECT_EQ(ValueType::kMergeOperand, t); + EXPECT_EQ("av1", v); + return Decision::kKeep; + } else if (k == "b") { + EXPECT_EQ(ValueType::kMergeOperand, t); + return Decision::kKeep; + } else if (k == "c") { + return Decision::kKeep; + } + + ADD_FAILURE(); + return Decision::kKeep; + } + + const char* Name() const override { + return "CompactionIteratorTest.SingleMergeOperand::Filter"; + } + }; + + class SingleMergeOp : public MergeOperator { + public: + bool FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const override { + // See InitIterators() call below for why "c" is the only key for which + // FullMergeV2 should be called. + EXPECT_EQ("c", merge_in.key.ToString()); + + std::string temp_value; + if (merge_in.existing_value != nullptr) { + temp_value = merge_in.existing_value->ToString(); + } + + for (auto& operand : merge_in.operand_list) { + temp_value.append(operand.ToString()); + } + merge_out->new_value = temp_value; + + return true; + } + + bool PartialMergeMulti(const Slice& key, + const std::deque<Slice>& operand_list, + std::string* new_value, + Logger* /*logger*/) const override { + std::string string_key = key.ToString(); + EXPECT_TRUE(string_key == "a" || string_key == "b"); + + if (string_key == "a") { + EXPECT_EQ(1, operand_list.size()); + } else if (string_key == "b") { + EXPECT_EQ(2, operand_list.size()); + } + + std::string temp_value; + for (auto& operand : operand_list) { + temp_value.append(operand.ToString()); + } + swap(temp_value, *new_value); + + return true; + } + + const char* Name() const override { + return "CompactionIteratorTest SingleMergeOp"; + } + + bool AllowSingleOperand() const override { return true; } + }; + + SingleMergeOp merge_op; + Filter filter; + InitIterators( + // a should invoke PartialMergeMulti with a single merge operand. + {test::KeyStr("a", 50, kTypeMerge), + // b should invoke PartialMergeMulti with two operands. + test::KeyStr("b", 70, kTypeMerge), test::KeyStr("b", 60, kTypeMerge), + // c should invoke FullMerge due to kTypeValue at the beginning. + test::KeyStr("c", 90, kTypeMerge), test::KeyStr("c", 80, kTypeValue)}, + {"av1", "bv2", "bv1", "cv2", "cv1"}, {}, {}, kMaxSequenceNumber, + kMaxSequenceNumber, &merge_op, &filter); + + c_iter_->SeekToFirst(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ(test::KeyStr("a", 50, kTypeMerge), c_iter_->key().ToString()); + ASSERT_EQ("av1", c_iter_->value().ToString()); + c_iter_->Next(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ("bv1bv2", c_iter_->value().ToString()); + c_iter_->Next(); + ASSERT_EQ("cv1cv2", c_iter_->value().ToString()); +} + +// In bottommost level, values earlier than earliest snapshot can be output +// with sequence = 0. +TEST_P(CompactionIteratorTest, ZeroOutSequenceAtBottomLevel) { + AddSnapshot(1); + RunTest({test::KeyStr("a", 1, kTypeValue), test::KeyStr("b", 2, kTypeValue)}, + {"v1", "v2"}, + {test::KeyStr("a", 0, kTypeValue), test::KeyStr("b", 2, kTypeValue)}, + {"v1", "v2"}, kMaxSequenceNumber /*last_commited_seq*/, + nullptr /*merge_operator*/, nullptr /*compaction_filter*/, + true /*bottommost_level*/); +} + +// In bottommost level, deletions earlier than earliest snapshot can be removed +// permanently. 
+TEST_P(CompactionIteratorTest, RemoveDeletionAtBottomLevel) { + AddSnapshot(1); + RunTest({test::KeyStr("a", 1, kTypeDeletion), + test::KeyStr("b", 3, kTypeDeletion), + test::KeyStr("b", 1, kTypeValue)}, + {"", "", ""}, + {test::KeyStr("b", 3, kTypeDeletion), + test::KeyStr("b", 0, kTypeValue)}, + {"", ""}, + kMaxSequenceNumber /*last_commited_seq*/, nullptr /*merge_operator*/, + nullptr /*compaction_filter*/, true /*bottommost_level*/); +} + +// In bottommost level, single deletions earlier than earliest snapshot can be +// removed permanently. +TEST_P(CompactionIteratorTest, RemoveSingleDeletionAtBottomLevel) { + AddSnapshot(1); + RunTest({test::KeyStr("a", 1, kTypeSingleDeletion), + test::KeyStr("b", 2, kTypeSingleDeletion)}, + {"", ""}, {test::KeyStr("b", 2, kTypeSingleDeletion)}, {""}, + kMaxSequenceNumber /*last_commited_seq*/, nullptr /*merge_operator*/, + nullptr /*compaction_filter*/, true /*bottommost_level*/); +} + +INSTANTIATE_TEST_CASE_P(CompactionIteratorTestInstance, CompactionIteratorTest, + testing::Values(true, false)); + +// Tests how CompactionIterator work together with SnapshotChecker. +class CompactionIteratorWithSnapshotCheckerTest + : public CompactionIteratorTest { + public: + bool UseSnapshotChecker() const override { return true; } +}; + +// Uncommitted keys (keys with seq > last_committed_seq) should be output as-is +// while committed version of these keys should get compacted as usual. + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + PreserveUncommittedKeys_Value) { + RunTest( + {test::KeyStr("foo", 3, kTypeValue), test::KeyStr("foo", 2, kTypeValue), + test::KeyStr("foo", 1, kTypeValue)}, + {"v3", "v2", "v1"}, + {test::KeyStr("foo", 3, kTypeValue), test::KeyStr("foo", 2, kTypeValue)}, + {"v3", "v2"}, 2 /*last_committed_seq*/); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + PreserveUncommittedKeys_Deletion) { + RunTest({test::KeyStr("foo", 2, kTypeDeletion), + test::KeyStr("foo", 1, kTypeValue)}, + {"", "v1"}, + {test::KeyStr("foo", 2, kTypeDeletion), + test::KeyStr("foo", 1, kTypeValue)}, + {"", "v1"}, 1 /*last_committed_seq*/); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + PreserveUncommittedKeys_Merge) { + auto merge_op = MergeOperators::CreateStringAppendOperator(); + RunTest( + {test::KeyStr("foo", 3, kTypeMerge), test::KeyStr("foo", 2, kTypeMerge), + test::KeyStr("foo", 1, kTypeValue)}, + {"v3", "v2", "v1"}, + {test::KeyStr("foo", 3, kTypeMerge), test::KeyStr("foo", 2, kTypeValue)}, + {"v3", "v1,v2"}, 2 /*last_committed_seq*/, merge_op.get()); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + PreserveUncommittedKeys_SingleDelete) { + RunTest({test::KeyStr("foo", 2, kTypeSingleDeletion), + test::KeyStr("foo", 1, kTypeValue)}, + {"", "v1"}, + {test::KeyStr("foo", 2, kTypeSingleDeletion), + test::KeyStr("foo", 1, kTypeValue)}, + {"", "v1"}, 1 /*last_committed_seq*/); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + PreserveUncommittedKeys_BlobIndex) { + RunTest({test::KeyStr("foo", 3, kTypeBlobIndex), + test::KeyStr("foo", 2, kTypeBlobIndex), + test::KeyStr("foo", 1, kTypeBlobIndex)}, + {"v3", "v2", "v1"}, + {test::KeyStr("foo", 3, kTypeBlobIndex), + test::KeyStr("foo", 2, kTypeBlobIndex)}, + {"v3", "v2"}, 2 /*last_committed_seq*/); +} + +// Test compaction iterator dedup keys visible to the same snapshot. 
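The AddSnapshot(2, 1) calls in the next group of tests are easy to misread: the first argument is the snapshot's own sequence number and the second is the last sequence that snapshot can actually see (the mapping TestSnapshotChecker consults). With AddSnapshot(2, 1), entries at sequence 2 or 3 are therefore invisible to snapshot 2, land in the same stripe, and get deduplicated. A small sanity-check sketch of that mapping (hypothetical test, using the test-only checker defined earlier):

    TEST_F(CompactionIteratorWithSnapshotCheckerTest, SnapshotVisibilitySanity) {
      // Mirrors what AddSnapshot(2, 1) sets up, with everything up to
      // sequence 3 committed.
      TestSnapshotChecker checker(3 /*last_committed*/, {{2, 1}});
      EXPECT_EQ(SnapshotCheckerResult::kInSnapshot,
                checker.CheckInSnapshot(1, 2));
      EXPECT_EQ(SnapshotCheckerResult::kNotInSnapshot,
                checker.CheckInSnapshot(2, 2));
      EXPECT_EQ(SnapshotCheckerResult::kNotInSnapshot,
                checker.CheckInSnapshot(3, 2));
    }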
+ +TEST_F(CompactionIteratorWithSnapshotCheckerTest, DedupSameSnapshot_Value) { + AddSnapshot(2, 1); + RunTest( + {test::KeyStr("foo", 4, kTypeValue), test::KeyStr("foo", 3, kTypeValue), + test::KeyStr("foo", 2, kTypeValue), test::KeyStr("foo", 1, kTypeValue)}, + {"v4", "v3", "v2", "v1"}, + {test::KeyStr("foo", 4, kTypeValue), test::KeyStr("foo", 3, kTypeValue), + test::KeyStr("foo", 1, kTypeValue)}, + {"v4", "v3", "v1"}, 3 /*last_committed_seq*/); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, DedupSameSnapshot_Deletion) { + AddSnapshot(2, 1); + RunTest( + {test::KeyStr("foo", 4, kTypeValue), + test::KeyStr("foo", 3, kTypeDeletion), + test::KeyStr("foo", 2, kTypeValue), test::KeyStr("foo", 1, kTypeValue)}, + {"v4", "", "v2", "v1"}, + {test::KeyStr("foo", 4, kTypeValue), + test::KeyStr("foo", 3, kTypeDeletion), + test::KeyStr("foo", 1, kTypeValue)}, + {"v4", "", "v1"}, 3 /*last_committed_seq*/); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, DedupSameSnapshot_Merge) { + AddSnapshot(2, 1); + AddSnapshot(4, 3); + auto merge_op = MergeOperators::CreateStringAppendOperator(); + RunTest( + {test::KeyStr("foo", 5, kTypeMerge), test::KeyStr("foo", 4, kTypeMerge), + test::KeyStr("foo", 3, kTypeMerge), test::KeyStr("foo", 2, kTypeMerge), + test::KeyStr("foo", 1, kTypeValue)}, + {"v5", "v4", "v3", "v2", "v1"}, + {test::KeyStr("foo", 5, kTypeMerge), test::KeyStr("foo", 4, kTypeMerge), + test::KeyStr("foo", 3, kTypeMerge), test::KeyStr("foo", 1, kTypeValue)}, + {"v5", "v4", "v2,v3", "v1"}, 4 /*last_committed_seq*/, merge_op.get()); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + DedupSameSnapshot_SingleDeletion) { + AddSnapshot(2, 1); + RunTest( + {test::KeyStr("foo", 4, kTypeValue), + test::KeyStr("foo", 3, kTypeSingleDeletion), + test::KeyStr("foo", 2, kTypeValue), test::KeyStr("foo", 1, kTypeValue)}, + {"v4", "", "v2", "v1"}, + {test::KeyStr("foo", 4, kTypeValue), test::KeyStr("foo", 1, kTypeValue)}, + {"v4", "v1"}, 3 /*last_committed_seq*/); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, DedupSameSnapshot_BlobIndex) { + AddSnapshot(2, 1); + RunTest({test::KeyStr("foo", 4, kTypeBlobIndex), + test::KeyStr("foo", 3, kTypeBlobIndex), + test::KeyStr("foo", 2, kTypeBlobIndex), + test::KeyStr("foo", 1, kTypeBlobIndex)}, + {"v4", "v3", "v2", "v1"}, + {test::KeyStr("foo", 4, kTypeBlobIndex), + test::KeyStr("foo", 3, kTypeBlobIndex), + test::KeyStr("foo", 1, kTypeBlobIndex)}, + {"v4", "v3", "v1"}, 3 /*last_committed_seq*/); +} + +// At bottom level, sequence numbers can be zero out, and deletions can be +// removed, but only when they are visible to earliest snapshot. 
+ +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + NotZeroOutSequenceIfNotVisibleToEarliestSnapshot) { + AddSnapshot(2, 1); + RunTest({test::KeyStr("a", 1, kTypeValue), test::KeyStr("b", 2, kTypeValue), + test::KeyStr("c", 3, kTypeValue)}, + {"v1", "v2", "v3"}, + {test::KeyStr("a", 0, kTypeValue), test::KeyStr("b", 2, kTypeValue), + test::KeyStr("c", 3, kTypeValue)}, + {"v1", "v2", "v3"}, kMaxSequenceNumber /*last_commited_seq*/, + nullptr /*merge_operator*/, nullptr /*compaction_filter*/, + true /*bottommost_level*/); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + NotRemoveDeletionIfNotVisibleToEarliestSnapshot) { + AddSnapshot(2, 1); + RunTest( + {test::KeyStr("a", 1, kTypeDeletion), test::KeyStr("b", 2, kTypeDeletion), + test::KeyStr("c", 3, kTypeDeletion)}, + {"", "", ""}, + {}, + {"", ""}, kMaxSequenceNumber /*last_commited_seq*/, + nullptr /*merge_operator*/, nullptr /*compaction_filter*/, + true /*bottommost_level*/); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + NotRemoveDeletionIfValuePresentToEarlierSnapshot) { + AddSnapshot(2,1); + RunTest( + {test::KeyStr("a", 4, kTypeDeletion), test::KeyStr("a", 1, kTypeValue), + test::KeyStr("b", 3, kTypeValue)}, + {"", "", ""}, + {test::KeyStr("a", 4, kTypeDeletion), test::KeyStr("a", 0, kTypeValue), + test::KeyStr("b", 3, kTypeValue)}, + {"", "", ""}, kMaxSequenceNumber /*last_commited_seq*/, + nullptr /*merge_operator*/, nullptr /*compaction_filter*/, + true /*bottommost_level*/); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + NotRemoveSingleDeletionIfNotVisibleToEarliestSnapshot) { + AddSnapshot(2, 1); + RunTest({test::KeyStr("a", 1, kTypeSingleDeletion), + test::KeyStr("b", 2, kTypeSingleDeletion), + test::KeyStr("c", 3, kTypeSingleDeletion)}, + {"", "", ""}, + {test::KeyStr("b", 2, kTypeSingleDeletion), + test::KeyStr("c", 3, kTypeSingleDeletion)}, + {"", ""}, kMaxSequenceNumber /*last_commited_seq*/, + nullptr /*merge_operator*/, nullptr /*compaction_filter*/, + true /*bottommost_level*/); +} + +// Single delete should not cancel out values that not visible to the +// same set of snapshots +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + SingleDeleteAcrossSnapshotBoundary) { + AddSnapshot(2, 1); + RunTest({test::KeyStr("a", 2, kTypeSingleDeletion), + test::KeyStr("a", 1, kTypeValue)}, + {"", "v1"}, + {test::KeyStr("a", 2, kTypeSingleDeletion), + test::KeyStr("a", 1, kTypeValue)}, + {"", "v1"}, 2 /*last_committed_seq*/); +} + +// Single delete should be kept in case it is not visible to the +// earliest write conflict snapshot. If a single delete is kept for this reason, +// corresponding value can be trimmed to save space. +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + KeepSingleDeletionForWriteConflictChecking) { + AddSnapshot(2, 0); + RunTest({test::KeyStr("a", 2, kTypeSingleDeletion), + test::KeyStr("a", 1, kTypeValue)}, + {"", "v1"}, + {test::KeyStr("a", 2, kTypeSingleDeletion), + test::KeyStr("a", 1, kTypeValue)}, + {"", ""}, 2 /*last_committed_seq*/, nullptr /*merge_operator*/, + nullptr /*compaction_filter*/, false /*bottommost_level*/, + 2 /*earliest_write_conflict_snapshot*/); +} + +// Compaction filter should keep uncommitted key as-is, and +// * Convert the latest velue to deletion, and/or +// * if latest value is a merge, apply filter to all suequent merges. 
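The remaining tests hand the filter straight to the CompactionIterator; in a real database the same filter would be installed through the column family options instead, roughly like this (sketch of standard RocksDB usage, not code from this patch):

    #include "rocksdb/options.h"

    void InstallFilter(ROCKSDB_NAMESPACE::Options* options) {
      // RocksDB does not take ownership of a raw compaction_filter pointer,
      // so the filter must outlive the DB.
      static FilterAllKeysCompactionFilter filter;
      options->compaction_filter = &filter;
    }

When a fresh, possibly stateful filter instance is wanted per compaction (as with StallingFilter above), options->compaction_filter_factory is the usual alternative.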
+ +TEST_F(CompactionIteratorWithSnapshotCheckerTest, CompactionFilter_Value) { + std::unique_ptr<CompactionFilter> compaction_filter( + new FilterAllKeysCompactionFilter()); + RunTest( + {test::KeyStr("a", 2, kTypeValue), test::KeyStr("a", 1, kTypeValue), + test::KeyStr("b", 3, kTypeValue), test::KeyStr("c", 1, kTypeValue)}, + {"v2", "v1", "v3", "v4"}, + {test::KeyStr("a", 2, kTypeValue), test::KeyStr("a", 1, kTypeDeletion), + test::KeyStr("b", 3, kTypeValue), test::KeyStr("c", 1, kTypeDeletion)}, + {"v2", "", "v3", ""}, 1 /*last_committed_seq*/, + nullptr /*merge_operator*/, compaction_filter.get()); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, CompactionFilter_Deletion) { + std::unique_ptr<CompactionFilter> compaction_filter( + new FilterAllKeysCompactionFilter()); + RunTest( + {test::KeyStr("a", 2, kTypeDeletion), test::KeyStr("a", 1, kTypeValue)}, + {"", "v1"}, + {test::KeyStr("a", 2, kTypeDeletion), + test::KeyStr("a", 1, kTypeDeletion)}, + {"", ""}, 1 /*last_committed_seq*/, nullptr /*merge_operator*/, + compaction_filter.get()); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + CompactionFilter_PartialMerge) { + std::shared_ptr<MergeOperator> merge_op = + MergeOperators::CreateStringAppendOperator(); + std::unique_ptr<CompactionFilter> compaction_filter( + new FilterAllKeysCompactionFilter()); + RunTest({test::KeyStr("a", 3, kTypeMerge), test::KeyStr("a", 2, kTypeMerge), + test::KeyStr("a", 1, kTypeMerge)}, + {"v3", "v2", "v1"}, {test::KeyStr("a", 3, kTypeMerge)}, {"v3"}, + 2 /*last_committed_seq*/, merge_op.get(), compaction_filter.get()); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, CompactionFilter_FullMerge) { + std::shared_ptr<MergeOperator> merge_op = + MergeOperators::CreateStringAppendOperator(); + std::unique_ptr<CompactionFilter> compaction_filter( + new FilterAllKeysCompactionFilter()); + RunTest( + {test::KeyStr("a", 3, kTypeMerge), test::KeyStr("a", 2, kTypeMerge), + test::KeyStr("a", 1, kTypeValue)}, + {"v3", "v2", "v1"}, + {test::KeyStr("a", 3, kTypeMerge), test::KeyStr("a", 1, kTypeDeletion)}, + {"v3", ""}, 2 /*last_committed_seq*/, merge_op.get(), + compaction_filter.get()); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/compaction/compaction_job.cc b/src/rocksdb/db/compaction/compaction_job.cc new file mode 100644 index 000000000..576ec7b45 --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_job.cc @@ -0,0 +1,1700 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include <algorithm> +#include <cinttypes> +#include <functional> +#include <list> +#include <memory> +#include <random> +#include <set> +#include <thread> +#include <utility> +#include <vector> + +#include "db/builder.h" +#include "db/compaction/compaction_job.h" +#include "db/db_impl/db_impl.h" +#include "db/db_iter.h" +#include "db/dbformat.h" +#include "db/error_handler.h" +#include "db/event_helpers.h" +#include "db/log_reader.h" +#include "db/log_writer.h" +#include "db/memtable.h" +#include "db/memtable_list.h" +#include "db/merge_context.h" +#include "db/merge_helper.h" +#include "db/range_del_aggregator.h" +#include "db/version_set.h" +#include "file/filename.h" +#include "file/read_write_util.h" +#include "file/sst_file_manager_impl.h" +#include "file/writable_file_writer.h" +#include "logging/log_buffer.h" +#include "logging/logging.h" +#include "monitoring/iostats_context_imp.h" +#include "monitoring/perf_context_imp.h" +#include "monitoring/thread_status_util.h" +#include "port/port.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/statistics.h" +#include "rocksdb/status.h" +#include "rocksdb/table.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/merging_iterator.h" +#include "table/table_builder.h" +#include "test_util/sync_point.h" +#include "util/coding.h" +#include "util/mutexlock.h" +#include "util/random.h" +#include "util/stop_watch.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +const char* GetCompactionReasonString(CompactionReason compaction_reason) { + switch (compaction_reason) { + case CompactionReason::kUnknown: + return "Unknown"; + case CompactionReason::kLevelL0FilesNum: + return "LevelL0FilesNum"; + case CompactionReason::kLevelMaxLevelSize: + return "LevelMaxLevelSize"; + case CompactionReason::kUniversalSizeAmplification: + return "UniversalSizeAmplification"; + case CompactionReason::kUniversalSizeRatio: + return "UniversalSizeRatio"; + case CompactionReason::kUniversalSortedRunNum: + return "UniversalSortedRunNum"; + case CompactionReason::kFIFOMaxSize: + return "FIFOMaxSize"; + case CompactionReason::kFIFOReduceNumFiles: + return "FIFOReduceNumFiles"; + case CompactionReason::kFIFOTtl: + return "FIFOTtl"; + case CompactionReason::kManualCompaction: + return "ManualCompaction"; + case CompactionReason::kFilesMarkedForCompaction: + return "FilesMarkedForCompaction"; + case CompactionReason::kBottommostFiles: + return "BottommostFiles"; + case CompactionReason::kTtl: + return "Ttl"; + case CompactionReason::kFlush: + return "Flush"; + case CompactionReason::kExternalSstIngestion: + return "ExternalSstIngestion"; + case CompactionReason::kPeriodicCompaction: + return "PeriodicCompaction"; + case CompactionReason::kNumOfReasons: + // fall through + default: + assert(false); + return "Invalid"; + } +} + +// Maintains state for each sub-compaction +struct CompactionJob::SubcompactionState { + const Compaction* compaction; + std::unique_ptr<CompactionIterator> c_iter; + + // The boundaries of the key-range this compaction is interested in. No two + // subcompactions may have overlapping key-ranges. 
+ // 'start' is inclusive, 'end' is exclusive, and nullptr means unbounded + Slice *start, *end; + + // The return status of this subcompaction + Status status; + + // Files produced by this subcompaction + struct Output { + FileMetaData meta; + bool finished; + std::shared_ptr<const TableProperties> table_properties; + }; + + // State kept for output being generated + std::vector<Output> outputs; + std::unique_ptr<WritableFileWriter> outfile; + std::unique_ptr<TableBuilder> builder; + Output* current_output() { + if (outputs.empty()) { + // This subcompaction's outptut could be empty if compaction was aborted + // before this subcompaction had a chance to generate any output files. + // When subcompactions are executed sequentially this is more likely and + // will be particulalry likely for the later subcompactions to be empty. + // Once they are run in parallel however it should be much rarer. + return nullptr; + } else { + return &outputs.back(); + } + } + + uint64_t current_output_file_size; + + // State during the subcompaction + uint64_t total_bytes; + uint64_t num_output_records; + CompactionJobStats compaction_job_stats; + uint64_t approx_size; + // An index that used to speed up ShouldStopBefore(). + size_t grandparent_index = 0; + // The number of bytes overlapping between the current output and + // grandparent files used in ShouldStopBefore(). + uint64_t overlapped_bytes = 0; + // A flag determine whether the key has been seen in ShouldStopBefore() + bool seen_key = false; + + SubcompactionState(Compaction* c, Slice* _start, Slice* _end, + uint64_t size = 0) + : compaction(c), + start(_start), + end(_end), + outfile(nullptr), + builder(nullptr), + current_output_file_size(0), + total_bytes(0), + num_output_records(0), + approx_size(size), + grandparent_index(0), + overlapped_bytes(0), + seen_key(false) { + assert(compaction != nullptr); + } + + SubcompactionState(SubcompactionState&& o) { *this = std::move(o); } + + SubcompactionState& operator=(SubcompactionState&& o) { + compaction = std::move(o.compaction); + start = std::move(o.start); + end = std::move(o.end); + status = std::move(o.status); + outputs = std::move(o.outputs); + outfile = std::move(o.outfile); + builder = std::move(o.builder); + current_output_file_size = std::move(o.current_output_file_size); + total_bytes = std::move(o.total_bytes); + num_output_records = std::move(o.num_output_records); + compaction_job_stats = std::move(o.compaction_job_stats); + approx_size = std::move(o.approx_size); + grandparent_index = std::move(o.grandparent_index); + overlapped_bytes = std::move(o.overlapped_bytes); + seen_key = std::move(o.seen_key); + return *this; + } + + // Because member std::unique_ptrs do not have these. + SubcompactionState(const SubcompactionState&) = delete; + + SubcompactionState& operator=(const SubcompactionState&) = delete; + + // Returns true iff we should stop building the current output + // before processing "internal_key". + bool ShouldStopBefore(const Slice& internal_key, uint64_t curr_file_size) { + const InternalKeyComparator* icmp = + &compaction->column_family_data()->internal_comparator(); + const std::vector<FileMetaData*>& grandparents = compaction->grandparents(); + + // Scan to find earliest grandparent file that contains key. 
+ while (grandparent_index < grandparents.size() && + icmp->Compare(internal_key, + grandparents[grandparent_index]->largest.Encode()) > + 0) { + if (seen_key) { + overlapped_bytes += grandparents[grandparent_index]->fd.GetFileSize(); + } + assert(grandparent_index + 1 >= grandparents.size() || + icmp->Compare( + grandparents[grandparent_index]->largest.Encode(), + grandparents[grandparent_index + 1]->smallest.Encode()) <= 0); + grandparent_index++; + } + seen_key = true; + + if (overlapped_bytes + curr_file_size > + compaction->max_compaction_bytes()) { + // Too much overlap for current output; start new output + overlapped_bytes = 0; + return true; + } + + return false; + } +}; + +// Maintains state for the entire compaction +struct CompactionJob::CompactionState { + Compaction* const compaction; + + // REQUIRED: subcompaction states are stored in order of increasing + // key-range + std::vector<CompactionJob::SubcompactionState> sub_compact_states; + Status status; + + uint64_t total_bytes; + uint64_t num_output_records; + + explicit CompactionState(Compaction* c) + : compaction(c), + total_bytes(0), + num_output_records(0) {} + + size_t NumOutputFiles() { + size_t total = 0; + for (auto& s : sub_compact_states) { + total += s.outputs.size(); + } + return total; + } + + Slice SmallestUserKey() { + for (const auto& sub_compact_state : sub_compact_states) { + if (!sub_compact_state.outputs.empty() && + sub_compact_state.outputs[0].finished) { + return sub_compact_state.outputs[0].meta.smallest.user_key(); + } + } + // If there is no finished output, return an empty slice. + return Slice(nullptr, 0); + } + + Slice LargestUserKey() { + for (auto it = sub_compact_states.rbegin(); it < sub_compact_states.rend(); + ++it) { + if (!it->outputs.empty() && it->current_output()->finished) { + assert(it->current_output() != nullptr); + return it->current_output()->meta.largest.user_key(); + } + } + // If there is no finished output, return an empty slice. 
+ return Slice(nullptr, 0); + } +}; + +void CompactionJob::AggregateStatistics() { + for (SubcompactionState& sc : compact_->sub_compact_states) { + compact_->total_bytes += sc.total_bytes; + compact_->num_output_records += sc.num_output_records; + } + if (compaction_job_stats_) { + for (SubcompactionState& sc : compact_->sub_compact_states) { + compaction_job_stats_->Add(sc.compaction_job_stats); + } + } +} + +CompactionJob::CompactionJob( + int job_id, Compaction* compaction, const ImmutableDBOptions& db_options, + const FileOptions& file_options, VersionSet* versions, + const std::atomic<bool>* shutting_down, + const SequenceNumber preserve_deletes_seqnum, LogBuffer* log_buffer, + Directory* db_directory, Directory* output_directory, Statistics* stats, + InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler, + std::vector<SequenceNumber> existing_snapshots, + SequenceNumber earliest_write_conflict_snapshot, + const SnapshotChecker* snapshot_checker, std::shared_ptr<Cache> table_cache, + EventLogger* event_logger, bool paranoid_file_checks, bool measure_io_stats, + const std::string& dbname, CompactionJobStats* compaction_job_stats, + Env::Priority thread_pri, const std::atomic<bool>* manual_compaction_paused) + : job_id_(job_id), + compact_(new CompactionState(compaction)), + compaction_job_stats_(compaction_job_stats), + compaction_stats_(compaction->compaction_reason(), 1), + dbname_(dbname), + db_options_(db_options), + file_options_(file_options), + env_(db_options.env), + fs_(db_options.fs.get()), + file_options_for_read_( + fs_->OptimizeForCompactionTableRead(file_options, db_options_)), + versions_(versions), + shutting_down_(shutting_down), + manual_compaction_paused_(manual_compaction_paused), + preserve_deletes_seqnum_(preserve_deletes_seqnum), + log_buffer_(log_buffer), + db_directory_(db_directory), + output_directory_(output_directory), + stats_(stats), + db_mutex_(db_mutex), + db_error_handler_(db_error_handler), + existing_snapshots_(std::move(existing_snapshots)), + earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot), + snapshot_checker_(snapshot_checker), + table_cache_(std::move(table_cache)), + event_logger_(event_logger), + bottommost_level_(false), + paranoid_file_checks_(paranoid_file_checks), + measure_io_stats_(measure_io_stats), + write_hint_(Env::WLTH_NOT_SET), + thread_pri_(thread_pri) { + assert(log_buffer_ != nullptr); + const auto* cfd = compact_->compaction->column_family_data(); + ThreadStatusUtil::SetColumnFamily(cfd, cfd->ioptions()->env, + db_options_.enable_thread_tracking); + ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION); + ReportStartedCompaction(compaction); +} + +CompactionJob::~CompactionJob() { + assert(compact_ == nullptr); + ThreadStatusUtil::ResetThreadStatus(); +} + +void CompactionJob::ReportStartedCompaction(Compaction* compaction) { + const auto* cfd = compact_->compaction->column_family_data(); + ThreadStatusUtil::SetColumnFamily(cfd, cfd->ioptions()->env, + db_options_.enable_thread_tracking); + + ThreadStatusUtil::SetThreadOperationProperty(ThreadStatus::COMPACTION_JOB_ID, + job_id_); + + ThreadStatusUtil::SetThreadOperationProperty( + ThreadStatus::COMPACTION_INPUT_OUTPUT_LEVEL, + (static_cast<uint64_t>(compact_->compaction->start_level()) << 32) + + compact_->compaction->output_level()); + + // In the current design, a CompactionJob is always created + // for non-trivial compaction. 
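+  // (Trivial moves are normally installed without creating a CompactionJob
+  // unless the compaction was requested manually, which is what the assertion
+  // below checks.) The COMPACTION_PROP_FLAGS property set a few lines further
+  // down is a small bit-packing example: bit 0 carries is_manual_compaction()
+  // and bit 1 carries deletion_compaction(), so -- purely as illustrative
+  // values -- a manual, non-deletion compaction is reported as 1 and an
+  // automatic deletion-style (FIFO) compaction as 2.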
+  assert(compaction->IsTrivialMove() == false ||
+         compaction->is_manual_compaction() == true);
+
+  ThreadStatusUtil::SetThreadOperationProperty(
+      ThreadStatus::COMPACTION_PROP_FLAGS,
+      compaction->is_manual_compaction() +
+          (compaction->deletion_compaction() << 1));
+
+  ThreadStatusUtil::SetThreadOperationProperty(
+      ThreadStatus::COMPACTION_TOTAL_INPUT_BYTES,
+      compaction->CalculateTotalInputSize());
+
+  IOSTATS_RESET(bytes_written);
+  IOSTATS_RESET(bytes_read);
+  ThreadStatusUtil::SetThreadOperationProperty(
+      ThreadStatus::COMPACTION_BYTES_WRITTEN, 0);
+  ThreadStatusUtil::SetThreadOperationProperty(
+      ThreadStatus::COMPACTION_BYTES_READ, 0);
+
+  // Set the thread operation after operation properties
+  // to ensure GetThreadList() can always show them all together.
+  ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION);
+
+  if (compaction_job_stats_) {
+    compaction_job_stats_->is_manual_compaction =
+        compaction->is_manual_compaction();
+  }
+}
+
+void CompactionJob::Prepare() {
+  AutoThreadOperationStageUpdater stage_updater(
+      ThreadStatus::STAGE_COMPACTION_PREPARE);
+
+  // Generate file_levels_ for compaction before making the iterator
+  auto* c = compact_->compaction;
+  assert(c->column_family_data() != nullptr);
+  assert(c->column_family_data()->current()->storage_info()->NumLevelFiles(
+             compact_->compaction->level()) > 0);
+
+  write_hint_ =
+      c->column_family_data()->CalculateSSTWriteHint(c->output_level());
+  bottommost_level_ = c->bottommost_level();
+
+  if (c->ShouldFormSubcompactions()) {
+    {
+      StopWatch sw(env_, stats_, SUBCOMPACTION_SETUP_TIME);
+      GenSubcompactionBoundaries();
+    }
+    assert(sizes_.size() == boundaries_.size() + 1);
+
+    for (size_t i = 0; i <= boundaries_.size(); i++) {
+      Slice* start = i == 0 ? nullptr : &boundaries_[i - 1];
+      Slice* end = i == boundaries_.size() ?
 nullptr : &boundaries_[i];
+      compact_->sub_compact_states.emplace_back(c, start, end, sizes_[i]);
+    }
+    RecordInHistogram(stats_, NUM_SUBCOMPACTIONS_SCHEDULED,
+                      compact_->sub_compact_states.size());
+  } else {
+    compact_->sub_compact_states.emplace_back(c, nullptr, nullptr);
+  }
+}
+
+struct RangeWithSize {
+  Range range;
+  uint64_t size;
+
+  RangeWithSize(const Slice& a, const Slice& b, uint64_t s = 0)
+      : range(a, b), size(s) {}
+};
+
+void CompactionJob::GenSubcompactionBoundaries() {
+  auto* c = compact_->compaction;
+  auto* cfd = c->column_family_data();
+  const Comparator* cfd_comparator = cfd->user_comparator();
+  std::vector<Slice> bounds;
+  int start_lvl = c->start_level();
+  int out_lvl = c->output_level();
+
+  // Add the starting and/or ending key of certain input files as a potential
+  // boundary
+  for (size_t lvl_idx = 0; lvl_idx < c->num_input_levels(); lvl_idx++) {
+    int lvl = c->level(lvl_idx);
+    if (lvl >= start_lvl && lvl <= out_lvl) {
+      const LevelFilesBrief* flevel = c->input_levels(lvl_idx);
+      size_t num_files = flevel->num_files;
+
+      if (num_files == 0) {
+        continue;
+      }
+
+      if (lvl == 0) {
+        // For level 0 add the starting and ending key of each file since the
+        // files may have greatly differing key ranges (not range-partitioned)
+        for (size_t i = 0; i < num_files; i++) {
+          bounds.emplace_back(flevel->files[i].smallest_key);
+          bounds.emplace_back(flevel->files[i].largest_key);
+        }
+      } else {
+        // For all other levels add the smallest/largest key in the level to
+        // encompass the range covered by that level
+        bounds.emplace_back(flevel->files[0].smallest_key);
+        bounds.emplace_back(flevel->files[num_files - 1].largest_key);
+        if (lvl == out_lvl) {
+          // For the last level include the starting keys of all files since
+          // the last level is the largest and probably has the widest key
+          // range. Since it's range partitioned, the ending key of one file
+          // and the starting key of the next are very close (or identical).
+          for (size_t i = 1; i < num_files; i++) {
+            bounds.emplace_back(flevel->files[i].smallest_key);
+          }
+        }
+      }
+    }
+  }
+
+  std::sort(bounds.begin(), bounds.end(),
+            [cfd_comparator](const Slice& a, const Slice& b) -> bool {
+              return cfd_comparator->Compare(ExtractUserKey(a),
+                                             ExtractUserKey(b)) < 0;
+            });
+  // Remove duplicated entries from bounds
+  bounds.erase(
+      std::unique(bounds.begin(), bounds.end(),
+                  [cfd_comparator](const Slice& a, const Slice& b) -> bool {
+                    return cfd_comparator->Compare(ExtractUserKey(a),
+                                                   ExtractUserKey(b)) == 0;
+                  }),
+      bounds.end());
+
+  // Combine consecutive pairs of boundaries into ranges with an approximate
+  // size of data covered by keys in that range
+  uint64_t sum = 0;
+  std::vector<RangeWithSize> ranges;
+  // Get input version from CompactionState since it's already referenced
+  // earlier in Compaction::SetInputVersion and will not change
+  // when db_mutex_ is released below
+  auto* v = compact_->compaction->input_version();
+  for (auto it = bounds.begin();;) {
+    const Slice a = *it;
+    ++it;
+
+    if (it == bounds.end()) {
+      break;
+    }
+
+    const Slice b = *it;
+
+    // ApproximateSize could potentially create a table reader iterator to seek
+    // to the index block and may incur I/O cost in the process.
Unlock db + // mutex to reduce contention + db_mutex_->Unlock(); + uint64_t size = versions_->ApproximateSize(SizeApproximationOptions(), v, a, + b, start_lvl, out_lvl + 1, + TableReaderCaller::kCompaction); + db_mutex_->Lock(); + ranges.emplace_back(a, b, size); + sum += size; + } + + // Group the ranges into subcompactions + const double min_file_fill_percent = 4.0 / 5; + int base_level = v->storage_info()->base_level(); + uint64_t max_output_files = static_cast<uint64_t>(std::ceil( + sum / min_file_fill_percent / + MaxFileSizeForLevel(*(c->mutable_cf_options()), out_lvl, + c->immutable_cf_options()->compaction_style, base_level, + c->immutable_cf_options()->level_compaction_dynamic_level_bytes))); + uint64_t subcompactions = + std::min({static_cast<uint64_t>(ranges.size()), + static_cast<uint64_t>(c->max_subcompactions()), + max_output_files}); + + if (subcompactions > 1) { + double mean = sum * 1.0 / subcompactions; + // Greedily add ranges to the subcompaction until the sum of the ranges' + // sizes becomes >= the expected mean size of a subcompaction + sum = 0; + for (size_t i = 0; i < ranges.size() - 1; i++) { + sum += ranges[i].size; + if (subcompactions == 1) { + // If there's only one left to schedule then it goes to the end so no + // need to put an end boundary + continue; + } + if (sum >= mean) { + boundaries_.emplace_back(ExtractUserKey(ranges[i].range.limit)); + sizes_.emplace_back(sum); + subcompactions--; + sum = 0; + } + } + sizes_.emplace_back(sum + ranges.back().size); + } else { + // Only one range so its size is the total sum of sizes computed above + sizes_.emplace_back(sum); + } +} + +Status CompactionJob::Run() { + AutoThreadOperationStageUpdater stage_updater( + ThreadStatus::STAGE_COMPACTION_RUN); + TEST_SYNC_POINT("CompactionJob::Run():Start"); + log_buffer_->FlushBufferToLog(); + LogCompaction(); + + const size_t num_threads = compact_->sub_compact_states.size(); + assert(num_threads > 0); + const uint64_t start_micros = env_->NowMicros(); + + // Launch a thread for each of subcompactions 1...num_threads-1 + std::vector<port::Thread> thread_pool; + thread_pool.reserve(num_threads - 1); + for (size_t i = 1; i < compact_->sub_compact_states.size(); i++) { + thread_pool.emplace_back(&CompactionJob::ProcessKeyValueCompaction, this, + &compact_->sub_compact_states[i]); + } + + // Always schedule the first subcompaction (whether or not there are also + // others) in the current thread to be efficient with resources + ProcessKeyValueCompaction(&compact_->sub_compact_states[0]); + + // Wait for all other threads (if there are any) to finish execution + for (auto& thread : thread_pool) { + thread.join(); + } + + compaction_stats_.micros = env_->NowMicros() - start_micros; + compaction_stats_.cpu_micros = 0; + for (size_t i = 0; i < compact_->sub_compact_states.size(); i++) { + compaction_stats_.cpu_micros += + compact_->sub_compact_states[i].compaction_job_stats.cpu_micros; + } + + RecordTimeToHistogram(stats_, COMPACTION_TIME, compaction_stats_.micros); + RecordTimeToHistogram(stats_, COMPACTION_CPU_TIME, + compaction_stats_.cpu_micros); + + TEST_SYNC_POINT("CompactionJob::Run:BeforeVerify"); + + // Check if any thread encountered an error during execution + Status status; + for (const auto& state : compact_->sub_compact_states) { + if (!state.status.ok()) { + status = state.status; + break; + } + } + + if (status.ok() && output_directory_) { + status = output_directory_->Fsync(); + } + + if (status.ok()) { + thread_pool.clear(); + std::vector<const FileMetaData*> 
files_meta; + for (const auto& state : compact_->sub_compact_states) { + for (const auto& output : state.outputs) { + files_meta.emplace_back(&output.meta); + } + } + ColumnFamilyData* cfd = compact_->compaction->column_family_data(); + auto prefix_extractor = + compact_->compaction->mutable_cf_options()->prefix_extractor.get(); + std::atomic<size_t> next_file_meta_idx(0); + auto verify_table = [&](Status& output_status) { + while (true) { + size_t file_idx = next_file_meta_idx.fetch_add(1); + if (file_idx >= files_meta.size()) { + break; + } + // Verify that the table is usable + // We set for_compaction to false and don't OptimizeForCompactionTableRead + // here because this is a special case after we finish the table building + // No matter whether use_direct_io_for_flush_and_compaction is true, + // we will regard this verification as user reads since the goal is + // to cache it here for further user reads + InternalIterator* iter = cfd->table_cache()->NewIterator( + ReadOptions(), file_options_, cfd->internal_comparator(), + *files_meta[file_idx], /*range_del_agg=*/nullptr, prefix_extractor, + /*table_reader_ptr=*/nullptr, + cfd->internal_stats()->GetFileReadHist( + compact_->compaction->output_level()), + TableReaderCaller::kCompactionRefill, /*arena=*/nullptr, + /*skip_filters=*/false, compact_->compaction->output_level(), + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr); + auto s = iter->status(); + + if (s.ok() && paranoid_file_checks_) { + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {} + s = iter->status(); + } + + delete iter; + + if (!s.ok()) { + output_status = s; + break; + } + } + }; + for (size_t i = 1; i < compact_->sub_compact_states.size(); i++) { + thread_pool.emplace_back(verify_table, + std::ref(compact_->sub_compact_states[i].status)); + } + verify_table(compact_->sub_compact_states[0].status); + for (auto& thread : thread_pool) { + thread.join(); + } + for (const auto& state : compact_->sub_compact_states) { + if (!state.status.ok()) { + status = state.status; + break; + } + } + } + + TablePropertiesCollection tp; + for (const auto& state : compact_->sub_compact_states) { + for (const auto& output : state.outputs) { + auto fn = + TableFileName(state.compaction->immutable_cf_options()->cf_paths, + output.meta.fd.GetNumber(), output.meta.fd.GetPathId()); + tp[fn] = output.table_properties; + } + } + compact_->compaction->SetOutputTableProperties(std::move(tp)); + + // Finish up all book-keeping to unify the subcompaction results + AggregateStatistics(); + UpdateCompactionStats(); + RecordCompactionIOStats(); + LogFlush(db_options_.info_log); + TEST_SYNC_POINT("CompactionJob::Run():End"); + + compact_->status = status; + return status; +} + +Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) { + AutoThreadOperationStageUpdater stage_updater( + ThreadStatus::STAGE_COMPACTION_INSTALL); + db_mutex_->AssertHeld(); + Status status = compact_->status; + ColumnFamilyData* cfd = compact_->compaction->column_family_data(); + cfd->internal_stats()->AddCompactionStats( + compact_->compaction->output_level(), thread_pri_, compaction_stats_); + + if (status.ok()) { + status = InstallCompactionResults(mutable_cf_options); + } + VersionStorageInfo::LevelSummaryStorage tmp; + auto vstorage = cfd->current()->storage_info(); + const auto& stats = compaction_stats_; + + double read_write_amp = 0.0; + double write_amp = 0.0; + double bytes_read_per_sec = 0; + double bytes_written_per_sec = 0; + + if 
(stats.bytes_read_non_output_levels > 0) { + read_write_amp = (stats.bytes_written + stats.bytes_read_output_level + + stats.bytes_read_non_output_levels) / + static_cast<double>(stats.bytes_read_non_output_levels); + write_amp = stats.bytes_written / + static_cast<double>(stats.bytes_read_non_output_levels); + } + if (stats.micros > 0) { + bytes_read_per_sec = + (stats.bytes_read_non_output_levels + stats.bytes_read_output_level) / + static_cast<double>(stats.micros); + bytes_written_per_sec = + stats.bytes_written / static_cast<double>(stats.micros); + } + + ROCKS_LOG_BUFFER( + log_buffer_, + "[%s] compacted to: %s, MB/sec: %.1f rd, %.1f wr, level %d, " + "files in(%d, %d) out(%d) " + "MB in(%.1f, %.1f) out(%.1f), read-write-amplify(%.1f) " + "write-amplify(%.1f) %s, records in: %" PRIu64 + ", records dropped: %" PRIu64 " output_compression: %s\n", + cfd->GetName().c_str(), vstorage->LevelSummary(&tmp), bytes_read_per_sec, + bytes_written_per_sec, compact_->compaction->output_level(), + stats.num_input_files_in_non_output_levels, + stats.num_input_files_in_output_level, stats.num_output_files, + stats.bytes_read_non_output_levels / 1048576.0, + stats.bytes_read_output_level / 1048576.0, + stats.bytes_written / 1048576.0, read_write_amp, write_amp, + status.ToString().c_str(), stats.num_input_records, + stats.num_dropped_records, + CompressionTypeToString(compact_->compaction->output_compression()) + .c_str()); + + UpdateCompactionJobStats(stats); + + auto stream = event_logger_->LogToBuffer(log_buffer_); + stream << "job" << job_id_ << "event" + << "compaction_finished" + << "compaction_time_micros" << stats.micros + << "compaction_time_cpu_micros" << stats.cpu_micros << "output_level" + << compact_->compaction->output_level() << "num_output_files" + << compact_->NumOutputFiles() << "total_output_size" + << compact_->total_bytes << "num_input_records" + << stats.num_input_records << "num_output_records" + << compact_->num_output_records << "num_subcompactions" + << compact_->sub_compact_states.size() << "output_compression" + << CompressionTypeToString(compact_->compaction->output_compression()); + + if (compaction_job_stats_ != nullptr) { + stream << "num_single_delete_mismatches" + << compaction_job_stats_->num_single_del_mismatch; + stream << "num_single_delete_fallthrough" + << compaction_job_stats_->num_single_del_fallthru; + } + + if (measure_io_stats_ && compaction_job_stats_ != nullptr) { + stream << "file_write_nanos" << compaction_job_stats_->file_write_nanos; + stream << "file_range_sync_nanos" + << compaction_job_stats_->file_range_sync_nanos; + stream << "file_fsync_nanos" << compaction_job_stats_->file_fsync_nanos; + stream << "file_prepare_write_nanos" + << compaction_job_stats_->file_prepare_write_nanos; + } + + stream << "lsm_state"; + stream.StartArray(); + for (int level = 0; level < vstorage->num_levels(); ++level) { + stream << vstorage->NumLevelFiles(level); + } + stream.EndArray(); + + CleanupCompaction(); + return status; +} + +void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { + assert(sub_compact != nullptr); + + uint64_t prev_cpu_micros = env_->NowCPUNanos() / 1000; + + ColumnFamilyData* cfd = sub_compact->compaction->column_family_data(); + + // Create compaction filter and fail the compaction if + // IgnoreSnapshots() = false because it is not supported anymore + const CompactionFilter* compaction_filter = + cfd->ioptions()->compaction_filter; + std::unique_ptr<CompactionFilter> compaction_filter_from_factory = nullptr; + if 
(compaction_filter == nullptr) { + compaction_filter_from_factory = + sub_compact->compaction->CreateCompactionFilter(); + compaction_filter = compaction_filter_from_factory.get(); + } + if (compaction_filter != nullptr && !compaction_filter->IgnoreSnapshots()) { + sub_compact->status = Status::NotSupported( + "CompactionFilter::IgnoreSnapshots() = false is not supported " + "anymore."); + return; + } + + CompactionRangeDelAggregator range_del_agg(&cfd->internal_comparator(), + existing_snapshots_); + + // Although the v2 aggregator is what the level iterator(s) know about, + // the AddTombstones calls will be propagated down to the v1 aggregator. + std::unique_ptr<InternalIterator> input(versions_->MakeInputIterator( + sub_compact->compaction, &range_del_agg, file_options_for_read_)); + + AutoThreadOperationStageUpdater stage_updater( + ThreadStatus::STAGE_COMPACTION_PROCESS_KV); + + // I/O measurement variables + PerfLevel prev_perf_level = PerfLevel::kEnableTime; + const uint64_t kRecordStatsEvery = 1000; + uint64_t prev_write_nanos = 0; + uint64_t prev_fsync_nanos = 0; + uint64_t prev_range_sync_nanos = 0; + uint64_t prev_prepare_write_nanos = 0; + uint64_t prev_cpu_write_nanos = 0; + uint64_t prev_cpu_read_nanos = 0; + if (measure_io_stats_) { + prev_perf_level = GetPerfLevel(); + SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex); + prev_write_nanos = IOSTATS(write_nanos); + prev_fsync_nanos = IOSTATS(fsync_nanos); + prev_range_sync_nanos = IOSTATS(range_sync_nanos); + prev_prepare_write_nanos = IOSTATS(prepare_write_nanos); + prev_cpu_write_nanos = IOSTATS(cpu_write_nanos); + prev_cpu_read_nanos = IOSTATS(cpu_read_nanos); + } + + MergeHelper merge( + env_, cfd->user_comparator(), cfd->ioptions()->merge_operator, + compaction_filter, db_options_.info_log.get(), + false /* internal key corruption is expected */, + existing_snapshots_.empty() ? 0 : existing_snapshots_.back(), + snapshot_checker_, compact_->compaction->level(), + db_options_.statistics.get()); + + TEST_SYNC_POINT("CompactionJob::Run():Inprogress"); + TEST_SYNC_POINT_CALLBACK( + "CompactionJob::Run():PausingManualCompaction:1", + reinterpret_cast<void*>( + const_cast<std::atomic<bool>*>(manual_compaction_paused_))); + + Slice* start = sub_compact->start; + Slice* end = sub_compact->end; + if (start != nullptr) { + IterKey start_iter; + start_iter.SetInternalKey(*start, kMaxSequenceNumber, kValueTypeForSeek); + input->Seek(start_iter.GetInternalKey()); + } else { + input->SeekToFirst(); + } + + Status status; + sub_compact->c_iter.reset(new CompactionIterator( + input.get(), cfd->user_comparator(), &merge, versions_->LastSequence(), + &existing_snapshots_, earliest_write_conflict_snapshot_, + snapshot_checker_, env_, ShouldReportDetailedTime(env_, stats_), false, + &range_del_agg, sub_compact->compaction, compaction_filter, + shutting_down_, preserve_deletes_seqnum_, manual_compaction_paused_, + db_options_.info_log)); + auto c_iter = sub_compact->c_iter.get(); + c_iter->SeekToFirst(); + if (c_iter->Valid() && sub_compact->compaction->output_level() != 0) { + // ShouldStopBefore() maintains state based on keys processed so far. The + // compaction loop always calls it on the "next" key, thus won't tell it the + // first key. So we do that here. 
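+    // Priming it with the first key mainly advances grandparent_index past
+    // grandparent files that end entirely below this subcompaction's range
+    // without charging their sizes to overlapped_bytes. For example (a
+    // hypothetical layout), if two grandparent files end below the first key,
+    // the call below skips over both of them for free, and only grandparent
+    // files passed by later keys count toward max_compaction_bytes().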
+ sub_compact->ShouldStopBefore(c_iter->key(), + sub_compact->current_output_file_size); + } + const auto& c_iter_stats = c_iter->iter_stats(); + + while (status.ok() && !cfd->IsDropped() && c_iter->Valid()) { + // Invariant: c_iter.status() is guaranteed to be OK if c_iter->Valid() + // returns true. + const Slice& key = c_iter->key(); + const Slice& value = c_iter->value(); + + // If an end key (exclusive) is specified, check if the current key is + // >= than it and exit if it is because the iterator is out of its range + if (end != nullptr && + cfd->user_comparator()->Compare(c_iter->user_key(), *end) >= 0) { + break; + } + if (c_iter_stats.num_input_records % kRecordStatsEvery == + kRecordStatsEvery - 1) { + RecordDroppedKeys(c_iter_stats, &sub_compact->compaction_job_stats); + c_iter->ResetRecordCounts(); + RecordCompactionIOStats(); + } + + // Open output file if necessary + if (sub_compact->builder == nullptr) { + status = OpenCompactionOutputFile(sub_compact); + if (!status.ok()) { + break; + } + } + assert(sub_compact->builder != nullptr); + assert(sub_compact->current_output() != nullptr); + sub_compact->builder->Add(key, value); + sub_compact->current_output_file_size = sub_compact->builder->FileSize(); + const ParsedInternalKey& ikey = c_iter->ikey(); + sub_compact->current_output()->meta.UpdateBoundaries( + key, value, ikey.sequence, ikey.type); + sub_compact->num_output_records++; + + // Close output file if it is big enough. Two possibilities determine it's + // time to close it: (1) the current key should be this file's last key, (2) + // the next key should not be in this file. + // + // TODO(aekmekji): determine if file should be closed earlier than this + // during subcompactions (i.e. if output size, estimated by input size, is + // going to be 1.2MB and max_output_file_size = 1MB, prefer to have 0.6MB + // and 0.6MB instead of 1MB and 0.2MB) + bool output_file_ended = false; + Status input_status; + if (sub_compact->compaction->output_level() != 0 && + sub_compact->current_output_file_size >= + sub_compact->compaction->max_output_file_size()) { + // (1) this key terminates the file. For historical reasons, the iterator + // status before advancing will be given to FinishCompactionOutputFile(). + input_status = input->status(); + output_file_ended = true; + } + TEST_SYNC_POINT_CALLBACK( + "CompactionJob::Run():PausingManualCompaction:2", + reinterpret_cast<void*>( + const_cast<std::atomic<bool>*>(manual_compaction_paused_))); + c_iter->Next(); + if (c_iter->status().IsManualCompactionPaused()) { + break; + } + if (!output_file_ended && c_iter->Valid() && + sub_compact->compaction->output_level() != 0 && + sub_compact->ShouldStopBefore(c_iter->key(), + sub_compact->current_output_file_size) && + sub_compact->builder != nullptr) { + // (2) this key belongs to the next file. For historical reasons, the + // iterator status after advancing will be given to + // FinishCompactionOutputFile(). 
+ input_status = input->status(); + output_file_ended = true; + } + if (output_file_ended) { + const Slice* next_key = nullptr; + if (c_iter->Valid()) { + next_key = &c_iter->key(); + } + CompactionIterationStats range_del_out_stats; + status = + FinishCompactionOutputFile(input_status, sub_compact, &range_del_agg, + &range_del_out_stats, next_key); + RecordDroppedKeys(range_del_out_stats, + &sub_compact->compaction_job_stats); + } + } + + sub_compact->compaction_job_stats.num_input_deletion_records = + c_iter_stats.num_input_deletion_records; + sub_compact->compaction_job_stats.num_corrupt_keys = + c_iter_stats.num_input_corrupt_records; + sub_compact->compaction_job_stats.num_single_del_fallthru = + c_iter_stats.num_single_del_fallthru; + sub_compact->compaction_job_stats.num_single_del_mismatch = + c_iter_stats.num_single_del_mismatch; + sub_compact->compaction_job_stats.total_input_raw_key_bytes += + c_iter_stats.total_input_raw_key_bytes; + sub_compact->compaction_job_stats.total_input_raw_value_bytes += + c_iter_stats.total_input_raw_value_bytes; + + RecordTick(stats_, FILTER_OPERATION_TOTAL_TIME, + c_iter_stats.total_filter_time); + RecordDroppedKeys(c_iter_stats, &sub_compact->compaction_job_stats); + RecordCompactionIOStats(); + + if (status.ok() && cfd->IsDropped()) { + status = + Status::ColumnFamilyDropped("Column family dropped during compaction"); + } + if ((status.ok() || status.IsColumnFamilyDropped()) && + shutting_down_->load(std::memory_order_relaxed)) { + status = Status::ShutdownInProgress("Database shutdown"); + } + if ((status.ok() || status.IsColumnFamilyDropped()) && + (manual_compaction_paused_ && + manual_compaction_paused_->load(std::memory_order_relaxed))) { + status = Status::Incomplete(Status::SubCode::kManualCompactionPaused); + } + if (status.ok()) { + status = input->status(); + } + if (status.ok()) { + status = c_iter->status(); + } + + if (status.ok() && sub_compact->builder == nullptr && + sub_compact->outputs.size() == 0 && !range_del_agg.IsEmpty()) { + // handle subcompaction containing only range deletions + status = OpenCompactionOutputFile(sub_compact); + } + + // Call FinishCompactionOutputFile() even if status is not ok: it needs to + // close the output file. 
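+  // As an illustrative case: a subcompaction whose key range contains only
+  // DeleteRange tombstones and no point keys never opened an output file in
+  // the main loop, so the range-deletion-only branch above opens one and the
+  // call below writes the tombstones out and closes the file.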
+ if (sub_compact->builder != nullptr) { + CompactionIterationStats range_del_out_stats; + Status s = FinishCompactionOutputFile(status, sub_compact, &range_del_agg, + &range_del_out_stats); + if (status.ok()) { + status = s; + } + RecordDroppedKeys(range_del_out_stats, &sub_compact->compaction_job_stats); + } + + sub_compact->compaction_job_stats.cpu_micros = + env_->NowCPUNanos() / 1000 - prev_cpu_micros; + + if (measure_io_stats_) { + sub_compact->compaction_job_stats.file_write_nanos += + IOSTATS(write_nanos) - prev_write_nanos; + sub_compact->compaction_job_stats.file_fsync_nanos += + IOSTATS(fsync_nanos) - prev_fsync_nanos; + sub_compact->compaction_job_stats.file_range_sync_nanos += + IOSTATS(range_sync_nanos) - prev_range_sync_nanos; + sub_compact->compaction_job_stats.file_prepare_write_nanos += + IOSTATS(prepare_write_nanos) - prev_prepare_write_nanos; + sub_compact->compaction_job_stats.cpu_micros -= + (IOSTATS(cpu_write_nanos) - prev_cpu_write_nanos + + IOSTATS(cpu_read_nanos) - prev_cpu_read_nanos) / + 1000; + if (prev_perf_level != PerfLevel::kEnableTimeAndCPUTimeExceptForMutex) { + SetPerfLevel(prev_perf_level); + } + } + + sub_compact->c_iter.reset(); + input.reset(); + sub_compact->status = status; +} + +void CompactionJob::RecordDroppedKeys( + const CompactionIterationStats& c_iter_stats, + CompactionJobStats* compaction_job_stats) { + if (c_iter_stats.num_record_drop_user > 0) { + RecordTick(stats_, COMPACTION_KEY_DROP_USER, + c_iter_stats.num_record_drop_user); + } + if (c_iter_stats.num_record_drop_hidden > 0) { + RecordTick(stats_, COMPACTION_KEY_DROP_NEWER_ENTRY, + c_iter_stats.num_record_drop_hidden); + if (compaction_job_stats) { + compaction_job_stats->num_records_replaced += + c_iter_stats.num_record_drop_hidden; + } + } + if (c_iter_stats.num_record_drop_obsolete > 0) { + RecordTick(stats_, COMPACTION_KEY_DROP_OBSOLETE, + c_iter_stats.num_record_drop_obsolete); + if (compaction_job_stats) { + compaction_job_stats->num_expired_deletion_records += + c_iter_stats.num_record_drop_obsolete; + } + } + if (c_iter_stats.num_record_drop_range_del > 0) { + RecordTick(stats_, COMPACTION_KEY_DROP_RANGE_DEL, + c_iter_stats.num_record_drop_range_del); + } + if (c_iter_stats.num_range_del_drop_obsolete > 0) { + RecordTick(stats_, COMPACTION_RANGE_DEL_DROP_OBSOLETE, + c_iter_stats.num_range_del_drop_obsolete); + } + if (c_iter_stats.num_optimized_del_drop_obsolete > 0) { + RecordTick(stats_, COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE, + c_iter_stats.num_optimized_del_drop_obsolete); + } +} + +Status CompactionJob::FinishCompactionOutputFile( + const Status& input_status, SubcompactionState* sub_compact, + CompactionRangeDelAggregator* range_del_agg, + CompactionIterationStats* range_del_out_stats, + const Slice* next_table_min_key /* = nullptr */) { + AutoThreadOperationStageUpdater stage_updater( + ThreadStatus::STAGE_COMPACTION_SYNC_FILE); + assert(sub_compact != nullptr); + assert(sub_compact->outfile); + assert(sub_compact->builder != nullptr); + assert(sub_compact->current_output() != nullptr); + + uint64_t output_number = sub_compact->current_output()->meta.fd.GetNumber(); + assert(output_number != 0); + + ColumnFamilyData* cfd = sub_compact->compaction->column_family_data(); + const Comparator* ucmp = cfd->user_comparator(); + + // Check for iterator errors + Status s = input_status; + auto meta = &sub_compact->current_output()->meta; + assert(meta != nullptr); + if (s.ok()) { + Slice lower_bound_guard, upper_bound_guard; + std::string smallest_user_key; + const Slice 
*lower_bound, *upper_bound; + bool lower_bound_from_sub_compact = false; + if (sub_compact->outputs.size() == 1) { + // For the first output table, include range tombstones before the min key + // but after the subcompaction boundary. + lower_bound = sub_compact->start; + lower_bound_from_sub_compact = true; + } else if (meta->smallest.size() > 0) { + // For subsequent output tables, only include range tombstones from min + // key onwards since the previous file was extended to contain range + // tombstones falling before min key. + smallest_user_key = meta->smallest.user_key().ToString(false /*hex*/); + lower_bound_guard = Slice(smallest_user_key); + lower_bound = &lower_bound_guard; + } else { + lower_bound = nullptr; + } + if (next_table_min_key != nullptr) { + // This may be the last file in the subcompaction in some cases, so we + // need to compare the end key of subcompaction with the next file start + // key. When the end key is chosen by the subcompaction, we know that + // it must be the biggest key in output file. Therefore, it is safe to + // use the smaller key as the upper bound of the output file, to ensure + // that there is no overlapping between different output files. + upper_bound_guard = ExtractUserKey(*next_table_min_key); + if (sub_compact->end != nullptr && + ucmp->Compare(upper_bound_guard, *sub_compact->end) >= 0) { + upper_bound = sub_compact->end; + } else { + upper_bound = &upper_bound_guard; + } + } else { + // This is the last file in the subcompaction, so extend until the + // subcompaction ends. + upper_bound = sub_compact->end; + } + auto earliest_snapshot = kMaxSequenceNumber; + if (existing_snapshots_.size() > 0) { + earliest_snapshot = existing_snapshots_[0]; + } + bool has_overlapping_endpoints; + if (upper_bound != nullptr && meta->largest.size() > 0) { + has_overlapping_endpoints = + ucmp->Compare(meta->largest.user_key(), *upper_bound) == 0; + } else { + has_overlapping_endpoints = false; + } + + // The end key of the subcompaction must be bigger or equal to the upper + // bound. If the end of subcompaction is null or the upper bound is null, + // it means that this file is the last file in the compaction. So there + // will be no overlapping between this file and others. + assert(sub_compact->end == nullptr || + upper_bound == nullptr || + ucmp->Compare(*upper_bound , *sub_compact->end) <= 0); + auto it = range_del_agg->NewIterator(lower_bound, upper_bound, + has_overlapping_endpoints); + // Position the range tombstone output iterator. There may be tombstone + // fragments that are entirely out of range, so make sure that we do not + // include those. + if (lower_bound != nullptr) { + it->Seek(*lower_bound); + } else { + it->SeekToFirst(); + } + for (; it->Valid(); it->Next()) { + auto tombstone = it->Tombstone(); + if (upper_bound != nullptr) { + int cmp = ucmp->Compare(*upper_bound, tombstone.start_key_); + if ((has_overlapping_endpoints && cmp < 0) || + (!has_overlapping_endpoints && cmp <= 0)) { + // Tombstones starting after upper_bound only need to be included in + // the next table. If the current SST ends before upper_bound, i.e., + // `has_overlapping_endpoints == false`, we can also skip over range + // tombstones that start exactly at upper_bound. Such range tombstones + // will be included in the next file and are not relevant to the point + // keys or endpoints of the current file. 
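+          // As a concrete example with made-up keys and upper_bound == "f":
+          // a tombstone starting at "g" is always left for the next table;
+          // one starting exactly at "f" is skipped here when this file's
+          // largest key is below "f" (has_overlapping_endpoints == false),
+          // but kept when the file's largest user key is itself "f".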
+          break;
+        }
+      }
+
+      if (bottommost_level_ && tombstone.seq_ <= earliest_snapshot) {
+        // TODO(andrewkr): tombstones that span multiple output files are
+        // counted for each compaction output file, so lots of double counting.
+        range_del_out_stats->num_range_del_drop_obsolete++;
+        range_del_out_stats->num_record_drop_obsolete++;
+        continue;
+      }
+
+      auto kv = tombstone.Serialize();
+      assert(lower_bound == nullptr ||
+             ucmp->Compare(*lower_bound, kv.second) < 0);
+      sub_compact->builder->Add(kv.first.Encode(), kv.second);
+      InternalKey smallest_candidate = std::move(kv.first);
+      if (lower_bound != nullptr &&
+          ucmp->Compare(smallest_candidate.user_key(), *lower_bound) <= 0) {
+        // Pretend the smallest key has the same user key as lower_bound
+        // (the max key in the previous table or subcompaction) in order for
+        // files to appear key-space partitioned.
+        //
+        // When lower_bound is chosen by a subcompaction, we know that
+        // subcompactions over smaller keys cannot contain any keys at
+        // lower_bound. We also know that smaller subcompactions exist, because
+        // otherwise the subcompaction would be unbounded on the left. As a
+        // result, we know that no other files on the output level will contain
+        // actual keys at lower_bound (an output file may have a largest key of
+        // lower_bound@kMaxSequenceNumber, but this only indicates a large range
+        // tombstone was truncated). Therefore, it is safe to use the
+        // tombstone's sequence number, to ensure that keys at lower_bound at
+        // lower levels are covered by truncated tombstones.
+        //
+        // If lower_bound was chosen by the smallest data key in the file,
+        // choose lowest seqnum so this file's smallest internal key comes after
+        // the previous file's largest. The fake seqnum is OK because the read
+        // path's file-picking code only considers user key.
+        smallest_candidate = InternalKey(
+            *lower_bound, lower_bound_from_sub_compact ? tombstone.seq_ : 0,
+            kTypeRangeDeletion);
+      }
+      InternalKey largest_candidate = tombstone.SerializeEndKey();
+      if (upper_bound != nullptr &&
+          ucmp->Compare(*upper_bound, largest_candidate.user_key()) <= 0) {
+        // Pretend the largest key has the same user key as upper_bound (the
+        // min key in the following table or subcompaction) in order for files
+        // to appear key-space partitioned.
+        //
+        // Choose highest seqnum so this file's largest internal key comes
+        // before the next file's/subcompaction's smallest. The fake seqnum is
+        // OK because the read path's file-picking code only considers the user
+        // key portion.
+        //
+        // Note Seek() also creates InternalKey with (user_key,
+        // kMaxSequenceNumber), but with kTypeDeletion (0x7) instead of
+        // kTypeRangeDeletion (0xF), so the range tombstone comes before the
+        // Seek() key in InternalKey's ordering. So Seek() will look in the
+        // next file for the user key.
+        largest_candidate =
+            InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion);
+      }
+#ifndef NDEBUG
+      SequenceNumber smallest_ikey_seqnum = kMaxSequenceNumber;
+      if (meta->smallest.size() > 0) {
+        smallest_ikey_seqnum = GetInternalKeySeqno(meta->smallest.Encode());
+      }
+#endif
+      meta->UpdateBoundariesForRange(smallest_candidate, largest_candidate,
+                                     tombstone.seq_,
+                                     cfd->internal_comparator());
+
+      // The smallest key in a file is used for range tombstone truncation, so
+      // it cannot have a seqnum of 0 (unless the smallest data key in a file
+      // has a seqnum of 0). Otherwise, the truncated tombstone may expose
+      // deleted keys at lower levels.
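+      // For instance, with made-up keys: if the smallest key were truncated
+      // to ("k", seq 0, kTypeRangeDeletion) while a deleted entry ("k", seq 5)
+      // still lives in a lower level, the truncated tombstone would start
+      // after ("k", 5) in internal-key order (sequence numbers sort in
+      // decreasing order), so that entry would no longer be covered and could
+      // become visible again.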
+      assert(smallest_ikey_seqnum == 0 ||
+             ExtractInternalKeyFooter(meta->smallest.Encode()) !=
+                 PackSequenceAndType(0, kTypeRangeDeletion));
+    }
+    meta->marked_for_compaction = sub_compact->builder->NeedCompact();
+  }
+  const uint64_t current_entries = sub_compact->builder->NumEntries();
+  if (s.ok()) {
+    s = sub_compact->builder->Finish();
+  } else {
+    sub_compact->builder->Abandon();
+  }
+  const uint64_t current_bytes = sub_compact->builder->FileSize();
+  if (s.ok()) {
+    // Add the checksum information to file metadata.
+    meta->file_checksum = sub_compact->builder->GetFileChecksum();
+    meta->file_checksum_func_name =
+        sub_compact->builder->GetFileChecksumFuncName();
+
+    meta->fd.file_size = current_bytes;
+  }
+  sub_compact->current_output()->finished = true;
+  sub_compact->total_bytes += current_bytes;
+
+  // Finish and check for file errors
+  if (s.ok()) {
+    StopWatch sw(env_, stats_, COMPACTION_OUTFILE_SYNC_MICROS);
+    s = sub_compact->outfile->Sync(db_options_.use_fsync);
+  }
+  if (s.ok()) {
+    s = sub_compact->outfile->Close();
+  }
+  sub_compact->outfile.reset();
+
+  TableProperties tp;
+  if (s.ok()) {
+    tp = sub_compact->builder->GetTableProperties();
+  }
+
+  if (s.ok() && current_entries == 0 && tp.num_range_deletions == 0) {
+    // If there is nothing to output, there is no need to generate an sst file.
+    // This happens when the output level is the bottom level and, at the same
+    // time, the sub_compact produced no output.
+    std::string fname =
+        TableFileName(sub_compact->compaction->immutable_cf_options()->cf_paths,
+                      meta->fd.GetNumber(), meta->fd.GetPathId());
+    env_->DeleteFile(fname);
+
+    // Also need to remove the file from outputs, or it will be added to the
+    // VersionEdit.
+    assert(!sub_compact->outputs.empty());
+    sub_compact->outputs.pop_back();
+    meta = nullptr;
+  }
+
+  if (s.ok() && (current_entries > 0 || tp.num_range_deletions > 0)) {
+    // Output to event logger and fire events.
+    sub_compact->current_output()->table_properties =
+        std::make_shared<TableProperties>(tp);
+    ROCKS_LOG_INFO(db_options_.info_log,
+                   "[%s] [JOB %d] Generated table #%" PRIu64 ": %" PRIu64
+                   " keys, %" PRIu64 " bytes%s",
+                   cfd->GetName().c_str(), job_id_, output_number,
+                   current_entries, current_bytes,
+                   meta->marked_for_compaction ? " (need compaction)" : "");
+  }
+  std::string fname;
+  FileDescriptor output_fd;
+  uint64_t oldest_blob_file_number = kInvalidBlobFileNumber;
+  if (meta != nullptr) {
+    fname =
+        TableFileName(sub_compact->compaction->immutable_cf_options()->cf_paths,
+                      meta->fd.GetNumber(), meta->fd.GetPathId());
+    output_fd = meta->fd;
+    oldest_blob_file_number = meta->oldest_blob_file_number;
+  } else {
+    fname = "(nil)";
+  }
+  EventHelpers::LogAndNotifyTableFileCreationFinished(
+      event_logger_, cfd->ioptions()->listeners, dbname_, cfd->GetName(), fname,
+      job_id_, output_fd, oldest_blob_file_number, tp,
+      TableFileCreationReason::kCompaction, s);
+
+#ifndef ROCKSDB_LITE
+  // Report new file to SstFileManagerImpl
+  auto sfm =
+      static_cast<SstFileManagerImpl*>(db_options_.sst_file_manager.get());
+  if (sfm && meta != nullptr && meta->fd.GetPathId() == 0) {
+    sfm->OnAddFile(fname);
+    if (sfm->IsMaxAllowedSpaceReached()) {
+      // TODO(ajkr): should we return OK() if max space was reached by the final
+      // compaction output file (similarly to how flush works when full)?
+ s = Status::SpaceLimit("Max allowed space was reached"); + TEST_SYNC_POINT( + "CompactionJob::FinishCompactionOutputFile:" + "MaxAllowedSpaceReached"); + InstrumentedMutexLock l(db_mutex_); + db_error_handler_->SetBGError(s, BackgroundErrorReason::kCompaction); + } + } +#endif + + sub_compact->builder.reset(); + sub_compact->current_output_file_size = 0; + return s; +} + +Status CompactionJob::InstallCompactionResults( + const MutableCFOptions& mutable_cf_options) { + db_mutex_->AssertHeld(); + + auto* compaction = compact_->compaction; + // paranoia: verify that the files that we started with + // still exist in the current version and in the same original level. + // This ensures that a concurrent compaction did not erroneously + // pick the same files to compact_. + if (!versions_->VerifyCompactionFileConsistency(compaction)) { + Compaction::InputLevelSummaryBuffer inputs_summary; + + ROCKS_LOG_ERROR(db_options_.info_log, "[%s] [JOB %d] Compaction %s aborted", + compaction->column_family_data()->GetName().c_str(), + job_id_, compaction->InputLevelSummary(&inputs_summary)); + return Status::Corruption("Compaction input files inconsistent"); + } + + { + Compaction::InputLevelSummaryBuffer inputs_summary; + ROCKS_LOG_INFO( + db_options_.info_log, "[%s] [JOB %d] Compacted %s => %" PRIu64 " bytes", + compaction->column_family_data()->GetName().c_str(), job_id_, + compaction->InputLevelSummary(&inputs_summary), compact_->total_bytes); + } + + // Add compaction inputs + compaction->AddInputDeletions(compact_->compaction->edit()); + + for (const auto& sub_compact : compact_->sub_compact_states) { + for (const auto& out : sub_compact.outputs) { + compaction->edit()->AddFile(compaction->output_level(), out.meta); + } + } + return versions_->LogAndApply(compaction->column_family_data(), + mutable_cf_options, compaction->edit(), + db_mutex_, db_directory_); +} + +void CompactionJob::RecordCompactionIOStats() { + RecordTick(stats_, COMPACT_READ_BYTES, IOSTATS(bytes_read)); + ThreadStatusUtil::IncreaseThreadOperationProperty( + ThreadStatus::COMPACTION_BYTES_READ, IOSTATS(bytes_read)); + IOSTATS_RESET(bytes_read); + RecordTick(stats_, COMPACT_WRITE_BYTES, IOSTATS(bytes_written)); + ThreadStatusUtil::IncreaseThreadOperationProperty( + ThreadStatus::COMPACTION_BYTES_WRITTEN, IOSTATS(bytes_written)); + IOSTATS_RESET(bytes_written); +} + +Status CompactionJob::OpenCompactionOutputFile( + SubcompactionState* sub_compact) { + assert(sub_compact != nullptr); + assert(sub_compact->builder == nullptr); + // no need to lock because VersionSet::next_file_number_ is atomic + uint64_t file_number = versions_->NewFileNumber(); + std::string fname = + TableFileName(sub_compact->compaction->immutable_cf_options()->cf_paths, + file_number, sub_compact->compaction->output_path_id()); + // Fire events. 
+ ColumnFamilyData* cfd = sub_compact->compaction->column_family_data(); +#ifndef ROCKSDB_LITE + EventHelpers::NotifyTableFileCreationStarted( + cfd->ioptions()->listeners, dbname_, cfd->GetName(), fname, job_id_, + TableFileCreationReason::kCompaction); +#endif // !ROCKSDB_LITE + // Make the output file + std::unique_ptr<FSWritableFile> writable_file; +#ifndef NDEBUG + bool syncpoint_arg = file_options_.use_direct_writes; + TEST_SYNC_POINT_CALLBACK("CompactionJob::OpenCompactionOutputFile", + &syncpoint_arg); +#endif + Status s = NewWritableFile(fs_, fname, &writable_file, file_options_); + if (!s.ok()) { + ROCKS_LOG_ERROR( + db_options_.info_log, + "[%s] [JOB %d] OpenCompactionOutputFiles for table #%" PRIu64 + " fails at NewWritableFile with status %s", + sub_compact->compaction->column_family_data()->GetName().c_str(), + job_id_, file_number, s.ToString().c_str()); + LogFlush(db_options_.info_log); + EventHelpers::LogAndNotifyTableFileCreationFinished( + event_logger_, cfd->ioptions()->listeners, dbname_, cfd->GetName(), + fname, job_id_, FileDescriptor(), kInvalidBlobFileNumber, + TableProperties(), TableFileCreationReason::kCompaction, s); + return s; + } + + // Try to figure out the output file's oldest ancester time. + int64_t temp_current_time = 0; + auto get_time_status = env_->GetCurrentTime(&temp_current_time); + // Safe to proceed even if GetCurrentTime fails. So, log and proceed. + if (!get_time_status.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Failed to get current time. Status: %s", + get_time_status.ToString().c_str()); + } + uint64_t current_time = static_cast<uint64_t>(temp_current_time); + uint64_t oldest_ancester_time = + sub_compact->compaction->MinInputFileOldestAncesterTime(); + if (oldest_ancester_time == port::kMaxUint64) { + oldest_ancester_time = current_time; + } + + // Initialize a SubcompactionState::Output and add it to sub_compact->outputs + { + SubcompactionState::Output out; + out.meta.fd = FileDescriptor(file_number, + sub_compact->compaction->output_path_id(), 0); + out.meta.oldest_ancester_time = oldest_ancester_time; + out.meta.file_creation_time = current_time; + out.finished = false; + sub_compact->outputs.push_back(out); + } + + writable_file->SetIOPriority(Env::IOPriority::IO_LOW); + writable_file->SetWriteLifeTimeHint(write_hint_); + writable_file->SetPreallocationBlockSize(static_cast<size_t>( + sub_compact->compaction->OutputFilePreallocationSize())); + const auto& listeners = + sub_compact->compaction->immutable_cf_options()->listeners; + sub_compact->outfile.reset( + new WritableFileWriter(std::move(writable_file), fname, file_options_, + env_, db_options_.statistics.get(), listeners, + db_options_.sst_file_checksum_func.get())); + + // If the Column family flag is to only optimize filters for hits, + // we can skip creating filters if this is the bottommost_level where + // data is going to be found + bool skip_filters = + cfd->ioptions()->optimize_filters_for_hits && bottommost_level_; + + sub_compact->builder.reset(NewTableBuilder( + *cfd->ioptions(), *(sub_compact->compaction->mutable_cf_options()), + cfd->internal_comparator(), cfd->int_tbl_prop_collector_factories(), + cfd->GetID(), cfd->GetName(), sub_compact->outfile.get(), + sub_compact->compaction->output_compression(), + 0 /*sample_for_compression */, + sub_compact->compaction->output_compression_opts(), + sub_compact->compaction->output_level(), skip_filters, + oldest_ancester_time, 0 /* oldest_key_time */, + sub_compact->compaction->max_output_file_size(), current_time)); 
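+  // To make the skip_filters decision above concrete (the setting is
+  // hypothetical): with optimize_filters_for_hits enabled and this compaction
+  // writing to the bottommost level, the builder created above omits filter
+  // blocks entirely; lookups that reach this level are expected to find their
+  // key here anyway, so a Bloom filter would cost space without saving reads.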
+  LogFlush(db_options_.info_log);
+  return s;
+}
+
+void CompactionJob::CleanupCompaction() {
+  for (SubcompactionState& sub_compact : compact_->sub_compact_states) {
+    const auto& sub_status = sub_compact.status;
+
+    if (sub_compact.builder != nullptr) {
+      // May happen if we get a shutdown call in the middle of compaction
+      sub_compact.builder->Abandon();
+      sub_compact.builder.reset();
+    } else {
+      assert(!sub_status.ok() || sub_compact.outfile == nullptr);
+    }
+    for (const auto& out : sub_compact.outputs) {
+      // If this file was inserted into the table cache then remove
+      // it here because this compaction was not committed.
+      if (!sub_status.ok()) {
+        TableCache::Evict(table_cache_.get(), out.meta.fd.GetNumber());
+      }
+    }
+  }
+  delete compact_;
+  compact_ = nullptr;
+}
+
+#ifndef ROCKSDB_LITE
+namespace {
+void CopyPrefix(const Slice& src, size_t prefix_length, std::string* dst) {
+  assert(prefix_length > 0);
+  size_t length = src.size() > prefix_length ? prefix_length : src.size();
+  dst->assign(src.data(), length);
+}
+}  // namespace
+
+#endif  // !ROCKSDB_LITE
+
+void CompactionJob::UpdateCompactionStats() {
+  Compaction* compaction = compact_->compaction;
+  compaction_stats_.num_input_files_in_non_output_levels = 0;
+  compaction_stats_.num_input_files_in_output_level = 0;
+  for (int input_level = 0;
+       input_level < static_cast<int>(compaction->num_input_levels());
+       ++input_level) {
+    if (compaction->level(input_level) != compaction->output_level()) {
+      UpdateCompactionInputStatsHelper(
+          &compaction_stats_.num_input_files_in_non_output_levels,
+          &compaction_stats_.bytes_read_non_output_levels, input_level);
+    } else {
+      UpdateCompactionInputStatsHelper(
+          &compaction_stats_.num_input_files_in_output_level,
+          &compaction_stats_.bytes_read_output_level, input_level);
+    }
+  }
+
+  uint64_t num_output_records = 0;
+
+  for (const auto& sub_compact : compact_->sub_compact_states) {
+    size_t num_output_files = sub_compact.outputs.size();
+    if (sub_compact.builder != nullptr) {
+      // An error occurred so ignore the last output.
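+      // For example, if a subcompaction had finished two output files and was
+      // still building a third when the error hit, only the two finished
+      // files are counted in num_output_files below.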
+ assert(num_output_files > 0); + --num_output_files; + } + compaction_stats_.num_output_files += static_cast<int>(num_output_files); + + num_output_records += sub_compact.num_output_records; + + for (const auto& out : sub_compact.outputs) { + compaction_stats_.bytes_written += out.meta.fd.file_size; + } + } + + if (compaction_stats_.num_input_records > num_output_records) { + compaction_stats_.num_dropped_records = + compaction_stats_.num_input_records - num_output_records; + } +} + +void CompactionJob::UpdateCompactionInputStatsHelper(int* num_files, + uint64_t* bytes_read, + int input_level) { + const Compaction* compaction = compact_->compaction; + auto num_input_files = compaction->num_input_files(input_level); + *num_files += static_cast<int>(num_input_files); + + for (size_t i = 0; i < num_input_files; ++i) { + const auto* file_meta = compaction->input(input_level, i); + *bytes_read += file_meta->fd.GetFileSize(); + compaction_stats_.num_input_records += + static_cast<uint64_t>(file_meta->num_entries); + } +} + +void CompactionJob::UpdateCompactionJobStats( + const InternalStats::CompactionStats& stats) const { +#ifndef ROCKSDB_LITE + if (compaction_job_stats_) { + compaction_job_stats_->elapsed_micros = stats.micros; + + // input information + compaction_job_stats_->total_input_bytes = + stats.bytes_read_non_output_levels + stats.bytes_read_output_level; + compaction_job_stats_->num_input_records = stats.num_input_records; + compaction_job_stats_->num_input_files = + stats.num_input_files_in_non_output_levels + + stats.num_input_files_in_output_level; + compaction_job_stats_->num_input_files_at_output_level = + stats.num_input_files_in_output_level; + + // output information + compaction_job_stats_->total_output_bytes = stats.bytes_written; + compaction_job_stats_->num_output_records = compact_->num_output_records; + compaction_job_stats_->num_output_files = stats.num_output_files; + + if (compact_->NumOutputFiles() > 0U) { + CopyPrefix(compact_->SmallestUserKey(), + CompactionJobStats::kMaxPrefixLength, + &compaction_job_stats_->smallest_output_key_prefix); + CopyPrefix(compact_->LargestUserKey(), + CompactionJobStats::kMaxPrefixLength, + &compaction_job_stats_->largest_output_key_prefix); + } + } +#else + (void)stats; +#endif // !ROCKSDB_LITE +} + +void CompactionJob::LogCompaction() { + Compaction* compaction = compact_->compaction; + ColumnFamilyData* cfd = compaction->column_family_data(); + + // Let's check if anything will get logged. 
Don't prepare all the info if + // we're not logging + if (db_options_.info_log_level <= InfoLogLevel::INFO_LEVEL) { + Compaction::InputLevelSummaryBuffer inputs_summary; + ROCKS_LOG_INFO( + db_options_.info_log, "[%s] [JOB %d] Compacting %s, score %.2f", + cfd->GetName().c_str(), job_id_, + compaction->InputLevelSummary(&inputs_summary), compaction->score()); + char scratch[2345]; + compaction->Summary(scratch, sizeof(scratch)); + ROCKS_LOG_INFO(db_options_.info_log, "[%s] Compaction start summary: %s\n", + cfd->GetName().c_str(), scratch); + // build event logger report + auto stream = event_logger_->Log(); + stream << "job" << job_id_ << "event" + << "compaction_started" + << "compaction_reason" + << GetCompactionReasonString(compaction->compaction_reason()); + for (size_t i = 0; i < compaction->num_input_levels(); ++i) { + stream << ("files_L" + ToString(compaction->level(i))); + stream.StartArray(); + for (auto f : *compaction->inputs(i)) { + stream << f->fd.GetNumber(); + } + stream.EndArray(); + } + stream << "score" << compaction->score() << "input_data_size" + << compaction->CalculateTotalInputSize(); + } +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/compaction/compaction_job.h b/src/rocksdb/db/compaction/compaction_job.h new file mode 100644 index 000000000..c15f502a1 --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_job.h @@ -0,0 +1,198 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#include <atomic> +#include <deque> +#include <functional> +#include <limits> +#include <set> +#include <string> +#include <utility> +#include <vector> + +#include "db/column_family.h" +#include "db/compaction/compaction_iterator.h" +#include "db/dbformat.h" +#include "db/flush_scheduler.h" +#include "db/internal_stats.h" +#include "db/job_context.h" +#include "db/log_writer.h" +#include "db/memtable_list.h" +#include "db/range_del_aggregator.h" +#include "db/version_edit.h" +#include "db/write_controller.h" +#include "db/write_thread.h" +#include "logging/event_logger.h" +#include "options/cf_options.h" +#include "options/db_options.h" +#include "port/port.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/compaction_job_stats.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/memtablerep.h" +#include "rocksdb/transaction_log.h" +#include "table/scoped_arena_iterator.h" +#include "util/autovector.h" +#include "util/stop_watch.h" +#include "util/thread_local.h" + +namespace ROCKSDB_NAMESPACE { + +class Arena; +class ErrorHandler; +class MemTable; +class SnapshotChecker; +class TableCache; +class Version; +class VersionEdit; +class VersionSet; + +// CompactionJob is responsible for executing the compaction. Each (manual or +// automated) compaction corresponds to a CompactionJob object, and usually +// goes through the stages of `Prepare()`->`Run()`->`Install()`. CompactionJob +// will divide the compaction into subcompactions and execute them in parallel +// if needed. 
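+//
+// A rough usage sketch (simplified; the constructor arguments are elided and
+// the locking requirements mirror the REQUIRED comments on each method):
+//
+//   CompactionJob job(job_id, compaction, db_options, file_options,
+//                     /* ...remaining constructor arguments... */);
+//   mutex.Lock();
+//   job.Prepare();                               // mutex held
+//   mutex.Unlock();
+//   Status run_status = job.Run();               // mutex not held
+//   mutex.Lock();
+//   Status s = job.Install(mutable_cf_options);  // mutex held
+//   mutex.Unlock();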
+class CompactionJob { + public: + CompactionJob(int job_id, Compaction* compaction, + const ImmutableDBOptions& db_options, + const FileOptions& file_options, VersionSet* versions, + const std::atomic<bool>* shutting_down, + const SequenceNumber preserve_deletes_seqnum, + LogBuffer* log_buffer, Directory* db_directory, + Directory* output_directory, Statistics* stats, + InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler, + std::vector<SequenceNumber> existing_snapshots, + SequenceNumber earliest_write_conflict_snapshot, + const SnapshotChecker* snapshot_checker, + std::shared_ptr<Cache> table_cache, EventLogger* event_logger, + bool paranoid_file_checks, bool measure_io_stats, + const std::string& dbname, + CompactionJobStats* compaction_job_stats, + Env::Priority thread_pri, + const std::atomic<bool>* manual_compaction_paused = nullptr); + + ~CompactionJob(); + + // no copy/move + CompactionJob(CompactionJob&& job) = delete; + CompactionJob(const CompactionJob& job) = delete; + CompactionJob& operator=(const CompactionJob& job) = delete; + + // REQUIRED: mutex held + // Prepare for the compaction by setting up boundaries for each subcompaction + void Prepare(); + // REQUIRED mutex not held + // Launch threads for each subcompaction and wait for them to finish. After + // that, verify table is usable and finally do bookkeeping to unify + // subcompaction results + Status Run(); + + // REQUIRED: mutex held + // Add compaction input/output to the current version + Status Install(const MutableCFOptions& mutable_cf_options); + + private: + struct SubcompactionState; + + void AggregateStatistics(); + + // Generates a histogram representing potential divisions of key ranges from + // the input. It adds the starting and/or ending keys of certain input files + // to the working set and then finds the approximate size of data in between + // each consecutive pair of slices. Then it divides these ranges into + // consecutive groups such that each group has a similar size. + void GenSubcompactionBoundaries(); + + // update the thread status for starting a compaction. + void ReportStartedCompaction(Compaction* compaction); + void AllocateCompactionOutputFileNumbers(); + // Call compaction filter. 
Then iterate through input and compact the + // kv-pairs + void ProcessKeyValueCompaction(SubcompactionState* sub_compact); + + Status FinishCompactionOutputFile( + const Status& input_status, SubcompactionState* sub_compact, + CompactionRangeDelAggregator* range_del_agg, + CompactionIterationStats* range_del_out_stats, + const Slice* next_table_min_key = nullptr); + Status InstallCompactionResults(const MutableCFOptions& mutable_cf_options); + void RecordCompactionIOStats(); + Status OpenCompactionOutputFile(SubcompactionState* sub_compact); + void CleanupCompaction(); + void UpdateCompactionJobStats( + const InternalStats::CompactionStats& stats) const; + void RecordDroppedKeys(const CompactionIterationStats& c_iter_stats, + CompactionJobStats* compaction_job_stats = nullptr); + + void UpdateCompactionStats(); + void UpdateCompactionInputStatsHelper( + int* num_files, uint64_t* bytes_read, int input_level); + + void LogCompaction(); + + int job_id_; + + // CompactionJob state + struct CompactionState; + CompactionState* compact_; + CompactionJobStats* compaction_job_stats_; + InternalStats::CompactionStats compaction_stats_; + + // DBImpl state + const std::string& dbname_; + const ImmutableDBOptions& db_options_; + const FileOptions file_options_; + + Env* env_; + FileSystem* fs_; + // env_option optimized for compaction table reads + FileOptions file_options_for_read_; + VersionSet* versions_; + const std::atomic<bool>* shutting_down_; + const std::atomic<bool>* manual_compaction_paused_; + const SequenceNumber preserve_deletes_seqnum_; + LogBuffer* log_buffer_; + Directory* db_directory_; + Directory* output_directory_; + Statistics* stats_; + InstrumentedMutex* db_mutex_; + ErrorHandler* db_error_handler_; + // If there were two snapshots with seq numbers s1 and + // s2 and s1 < s2, and if we find two instances of a key k1 then lies + // entirely within s1 and s2, then the earlier version of k1 can be safely + // deleted because that version is not visible in any snapshot. + std::vector<SequenceNumber> existing_snapshots_; + + // This is the earliest snapshot that could be used for write-conflict + // checking by a transaction. For any user-key newer than this snapshot, we + // should make sure not to remove evidence that a write occurred. + SequenceNumber earliest_write_conflict_snapshot_; + + const SnapshotChecker* const snapshot_checker_; + + std::shared_ptr<Cache> table_cache_; + + EventLogger* event_logger_; + + // Is this compaction creating a file in the bottom most level? + bool bottommost_level_; + bool paranoid_file_checks_; + bool measure_io_stats_; + // Stores the Slices that designate the boundaries for each subcompaction + std::vector<Slice> boundaries_; + // Stores the approx size of keys covered in the range of each subcompaction + std::vector<uint64_t> sizes_; + Env::WriteLifeTimeHint write_hint_; + Env::Priority thread_pri_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/compaction/compaction_job_stats_test.cc b/src/rocksdb/db/compaction/compaction_job_stats_test.cc new file mode 100644 index 000000000..51a665797 --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_job_stats_test.cc @@ -0,0 +1,1043 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include <algorithm> +#include <cinttypes> +#include <iostream> +#include <mutex> +#include <queue> +#include <set> +#include <thread> +#include <unordered_set> +#include <utility> + +#include "db/db_impl/db_impl.h" +#include "db/dbformat.h" +#include "db/job_context.h" +#include "db/version_set.h" +#include "db/write_batch_internal.h" +#include "env/mock_env.h" +#include "file/filename.h" +#include "logging/logging.h" +#include "memtable/hash_linklist_rep.h" +#include "monitoring/statistics.h" +#include "monitoring/thread_status_util.h" +#include "port/stack_trace.h" +#include "rocksdb/cache.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/convenience.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/experimental.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/options.h" +#include "rocksdb/perf_context.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/table.h" +#include "rocksdb/table_properties.h" +#include "rocksdb/thread_status.h" +#include "rocksdb/utilities/checkpoint.h" +#include "rocksdb/utilities/write_batch_with_index.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/mock_table.h" +#include "table/plain/plain_table_factory.h" +#include "table/scoped_arena_iterator.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/compression.h" +#include "util/hash.h" +#include "util/mutexlock.h" +#include "util/rate_limiter.h" +#include "util/string_util.h" +#include "utilities/merge_operators.h" + +#if !defined(IOS_CROSS_COMPILE) +#ifndef ROCKSDB_LITE +namespace ROCKSDB_NAMESPACE { + +static std::string RandomString(Random* rnd, int len, double ratio) { + std::string r; + test::CompressibleString(rnd, ratio, len, &r); + return r; +} + +std::string Key(uint64_t key, int length) { + const int kBufSize = 1000; + char buf[kBufSize]; + if (length > kBufSize) { + length = kBufSize; + } + snprintf(buf, kBufSize, "%0*" PRIu64, length, key); + return std::string(buf); +} + +class CompactionJobStatsTest : public testing::Test, + public testing::WithParamInterface<bool> { + public: + std::string dbname_; + std::string alternative_wal_dir_; + Env* env_; + DB* db_; + std::vector<ColumnFamilyHandle*> handles_; + uint32_t max_subcompactions_; + + Options last_options_; + + CompactionJobStatsTest() : env_(Env::Default()) { + env_->SetBackgroundThreads(1, Env::LOW); + env_->SetBackgroundThreads(1, Env::HIGH); + dbname_ = test::PerThreadDBPath("compaction_job_stats_test"); + alternative_wal_dir_ = dbname_ + "/wal"; + Options options; + options.create_if_missing = true; + max_subcompactions_ = GetParam(); + options.max_subcompactions = max_subcompactions_; + auto delete_options = options; + delete_options.wal_dir = alternative_wal_dir_; + EXPECT_OK(DestroyDB(dbname_, delete_options)); + // Destroy it for not alternative WAL dir is used. 
+ EXPECT_OK(DestroyDB(dbname_, options)); + db_ = nullptr; + Reopen(options); + } + + ~CompactionJobStatsTest() override { + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + Close(); + Options options; + options.db_paths.emplace_back(dbname_, 0); + options.db_paths.emplace_back(dbname_ + "_2", 0); + options.db_paths.emplace_back(dbname_ + "_3", 0); + options.db_paths.emplace_back(dbname_ + "_4", 0); + EXPECT_OK(DestroyDB(dbname_, options)); + } + + // Required if inheriting from testing::WithParamInterface<> + static void SetUpTestCase() {} + static void TearDownTestCase() {} + + DBImpl* dbfull() { + return reinterpret_cast<DBImpl*>(db_); + } + + void CreateColumnFamilies(const std::vector<std::string>& cfs, + const Options& options) { + ColumnFamilyOptions cf_opts(options); + size_t cfi = handles_.size(); + handles_.resize(cfi + cfs.size()); + for (auto cf : cfs) { + ASSERT_OK(db_->CreateColumnFamily(cf_opts, cf, &handles_[cfi++])); + } + } + + void CreateAndReopenWithCF(const std::vector<std::string>& cfs, + const Options& options) { + CreateColumnFamilies(cfs, options); + std::vector<std::string> cfs_plus_default = cfs; + cfs_plus_default.insert(cfs_plus_default.begin(), kDefaultColumnFamilyName); + ReopenWithColumnFamilies(cfs_plus_default, options); + } + + void ReopenWithColumnFamilies(const std::vector<std::string>& cfs, + const std::vector<Options>& options) { + ASSERT_OK(TryReopenWithColumnFamilies(cfs, options)); + } + + void ReopenWithColumnFamilies(const std::vector<std::string>& cfs, + const Options& options) { + ASSERT_OK(TryReopenWithColumnFamilies(cfs, options)); + } + + Status TryReopenWithColumnFamilies( + const std::vector<std::string>& cfs, + const std::vector<Options>& options) { + Close(); + EXPECT_EQ(cfs.size(), options.size()); + std::vector<ColumnFamilyDescriptor> column_families; + for (size_t i = 0; i < cfs.size(); ++i) { + column_families.push_back(ColumnFamilyDescriptor(cfs[i], options[i])); + } + DBOptions db_opts = DBOptions(options[0]); + return DB::Open(db_opts, dbname_, column_families, &handles_, &db_); + } + + Status TryReopenWithColumnFamilies(const std::vector<std::string>& cfs, + const Options& options) { + Close(); + std::vector<Options> v_opts(cfs.size(), options); + return TryReopenWithColumnFamilies(cfs, v_opts); + } + + void Reopen(const Options& options) { + ASSERT_OK(TryReopen(options)); + } + + void Close() { + for (auto h : handles_) { + delete h; + } + handles_.clear(); + delete db_; + db_ = nullptr; + } + + void DestroyAndReopen(const Options& options) { + // Destroy using last options + Destroy(last_options_); + ASSERT_OK(TryReopen(options)); + } + + void Destroy(const Options& options) { + Close(); + ASSERT_OK(DestroyDB(dbname_, options)); + } + + Status ReadOnlyReopen(const Options& options) { + return DB::OpenForReadOnly(options, dbname_, &db_); + } + + Status TryReopen(const Options& options) { + Close(); + last_options_ = options; + return DB::Open(options, dbname_, &db_); + } + + Status Flush(int cf = 0) { + if (cf == 0) { + return db_->Flush(FlushOptions()); + } else { + return db_->Flush(FlushOptions(), handles_[cf]); + } + } + + Status Put(const Slice& k, const Slice& v, WriteOptions wo = WriteOptions()) { + return db_->Put(wo, k, v); + } + + Status Put(int cf, const Slice& k, const Slice& v, + WriteOptions wo = WriteOptions()) { + return db_->Put(wo, handles_[cf], k, v); + } + + 
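A side note on the Key() helper defined above (the values here are illustrative, not part of the test): it zero-pads the numeric key to the requested width with snprintf("%0*" PRIu64, ...), so for example

  Key(42, 10)         yields "0000000042"
  Key(100000000, 10)  yields "0100000000"

which keeps the lexicographic order of the generated keys consistent with their numeric order at a fixed width.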
Status Delete(const std::string& k) { + return db_->Delete(WriteOptions(), k); + } + + Status Delete(int cf, const std::string& k) { + return db_->Delete(WriteOptions(), handles_[cf], k); + } + + std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) { + ReadOptions options; + options.verify_checksums = true; + options.snapshot = snapshot; + std::string result; + Status s = db_->Get(options, k, &result); + if (s.IsNotFound()) { + result = "NOT_FOUND"; + } else if (!s.ok()) { + result = s.ToString(); + } + return result; + } + + std::string Get(int cf, const std::string& k, + const Snapshot* snapshot = nullptr) { + ReadOptions options; + options.verify_checksums = true; + options.snapshot = snapshot; + std::string result; + Status s = db_->Get(options, handles_[cf], k, &result); + if (s.IsNotFound()) { + result = "NOT_FOUND"; + } else if (!s.ok()) { + result = s.ToString(); + } + return result; + } + + int NumTableFilesAtLevel(int level, int cf = 0) { + std::string property; + if (cf == 0) { + // default cfd + EXPECT_TRUE(db_->GetProperty( + "rocksdb.num-files-at-level" + NumberToString(level), &property)); + } else { + EXPECT_TRUE(db_->GetProperty( + handles_[cf], "rocksdb.num-files-at-level" + NumberToString(level), + &property)); + } + return atoi(property.c_str()); + } + + // Return spread of files per level + std::string FilesPerLevel(int cf = 0) { + int num_levels = + (cf == 0) ? db_->NumberLevels() : db_->NumberLevels(handles_[1]); + std::string result; + size_t last_non_zero_offset = 0; + for (int level = 0; level < num_levels; level++) { + int f = NumTableFilesAtLevel(level, cf); + char buf[100]; + snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f); + result += buf; + if (f > 0) { + last_non_zero_offset = result.size(); + } + } + result.resize(last_non_zero_offset); + return result; + } + + uint64_t Size(const Slice& start, const Slice& limit, int cf = 0) { + Range r(start, limit); + uint64_t size; + if (cf == 0) { + db_->GetApproximateSizes(&r, 1, &size); + } else { + db_->GetApproximateSizes(handles_[1], &r, 1, &size); + } + return size; + } + + void Compact(int cf, const Slice& start, const Slice& limit, + uint32_t target_path_id) { + CompactRangeOptions compact_options; + compact_options.target_path_id = target_path_id; + ASSERT_OK(db_->CompactRange(compact_options, handles_[cf], &start, &limit)); + } + + void Compact(int cf, const Slice& start, const Slice& limit) { + ASSERT_OK( + db_->CompactRange(CompactRangeOptions(), handles_[cf], &start, &limit)); + } + + void Compact(const Slice& start, const Slice& limit) { + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &limit)); + } + + void TEST_Compact(int level, int cf, const Slice& start, const Slice& limit) { + ASSERT_OK(dbfull()->TEST_CompactRange(level, &start, &limit, handles_[cf], + true /* disallow trivial move */)); + } + + // Do n memtable compactions, each of which produces an sstable + // covering the range [small,large]. 
+ void MakeTables(int n, const std::string& small, const std::string& large, + int cf = 0) { + for (int i = 0; i < n; i++) { + ASSERT_OK(Put(cf, small, "begin")); + ASSERT_OK(Put(cf, large, "end")); + ASSERT_OK(Flush(cf)); + } + } + + static void SetDeletionCompactionStats( + CompactionJobStats *stats, uint64_t input_deletions, + uint64_t expired_deletions, uint64_t records_replaced) { + stats->num_input_deletion_records = input_deletions; + stats->num_expired_deletion_records = expired_deletions; + stats->num_records_replaced = records_replaced; + } + + void MakeTableWithKeyValues( + Random* rnd, uint64_t smallest, uint64_t largest, + int key_size, int value_size, uint64_t interval, + double ratio, int cf = 0) { + for (auto key = smallest; key < largest; key += interval) { + ASSERT_OK(Put(cf, Slice(Key(key, key_size)), + Slice(RandomString(rnd, value_size, ratio)))); + } + ASSERT_OK(Flush(cf)); + } + + // This function behaves with the implicit understanding that two + // rounds of keys are inserted into the database, as per the behavior + // of the DeletionStatsTest. + void SelectivelyDeleteKeys(uint64_t smallest, uint64_t largest, + uint64_t interval, int deletion_interval, int key_size, + uint64_t cutoff_key_num, CompactionJobStats* stats, int cf = 0) { + + // interval needs to be >= 2 so that deletion entries can be inserted + // that are intended to not result in an actual key deletion by using + // an offset of 1 from another existing key + ASSERT_GE(interval, 2); + + uint64_t ctr = 1; + uint32_t deletions_made = 0; + uint32_t num_deleted = 0; + uint32_t num_expired = 0; + for (auto key = smallest; key <= largest; key += interval, ctr++) { + if (ctr % deletion_interval == 0) { + ASSERT_OK(Delete(cf, Key(key, key_size))); + deletions_made++; + num_deleted++; + + if (key > cutoff_key_num) { + num_expired++; + } + } + } + + // Insert some deletions for keys that don't exist that + // are both in and out of the key range + ASSERT_OK(Delete(cf, Key(smallest+1, key_size))); + deletions_made++; + + ASSERT_OK(Delete(cf, Key(smallest-1, key_size))); + deletions_made++; + num_expired++; + + ASSERT_OK(Delete(cf, Key(smallest-9, key_size))); + deletions_made++; + num_expired++; + + ASSERT_OK(Flush(cf)); + SetDeletionCompactionStats(stats, deletions_made, num_expired, + num_deleted); + } +}; + +// An EventListener which helps verify the compaction results in +// test CompactionJobStatsTest. +class CompactionJobStatsChecker : public EventListener { + public: + CompactionJobStatsChecker() + : compression_enabled_(false), verify_next_comp_io_stats_(false) {} + + size_t NumberOfUnverifiedStats() { return expected_stats_.size(); } + + void set_verify_next_comp_io_stats(bool v) { verify_next_comp_io_stats_ = v; } + + // Once a compaction completed, this function will verify the returned + // CompactionJobInfo with the oldest CompactionJobInfo added earlier + // in "expected_stats_" which has not yet being used for verification. + void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) override { + if (verify_next_comp_io_stats_) { + ASSERT_GT(ci.stats.file_write_nanos, 0); + ASSERT_GT(ci.stats.file_range_sync_nanos, 0); + ASSERT_GT(ci.stats.file_fsync_nanos, 0); + ASSERT_GT(ci.stats.file_prepare_write_nanos, 0); + verify_next_comp_io_stats_ = false; + } + + std::lock_guard<std::mutex> lock(mutex_); + if (expected_stats_.size()) { + Verify(ci.stats, expected_stats_.front()); + expected_stats_.pop(); + } + } + + // A helper function which verifies whether two CompactionJobStats + // match. 
The verification of all compaction stats are done by + // ASSERT_EQ except for the total input / output bytes, which we + // use ASSERT_GE and ASSERT_LE with a reasonable bias --- + // 10% in uncompressed case and 20% when compression is used. + virtual void Verify(const CompactionJobStats& current_stats, + const CompactionJobStats& stats) { + // time + ASSERT_GT(current_stats.elapsed_micros, 0U); + + ASSERT_EQ(current_stats.num_input_records, + stats.num_input_records); + ASSERT_EQ(current_stats.num_input_files, + stats.num_input_files); + ASSERT_EQ(current_stats.num_input_files_at_output_level, + stats.num_input_files_at_output_level); + + ASSERT_EQ(current_stats.num_output_records, + stats.num_output_records); + ASSERT_EQ(current_stats.num_output_files, + stats.num_output_files); + + ASSERT_EQ(current_stats.is_manual_compaction, + stats.is_manual_compaction); + + // file size + double kFileSizeBias = compression_enabled_ ? 0.20 : 0.10; + ASSERT_GE(current_stats.total_input_bytes * (1.00 + kFileSizeBias), + stats.total_input_bytes); + ASSERT_LE(current_stats.total_input_bytes, + stats.total_input_bytes * (1.00 + kFileSizeBias)); + ASSERT_GE(current_stats.total_output_bytes * (1.00 + kFileSizeBias), + stats.total_output_bytes); + ASSERT_LE(current_stats.total_output_bytes, + stats.total_output_bytes * (1.00 + kFileSizeBias)); + ASSERT_EQ(current_stats.total_input_raw_key_bytes, + stats.total_input_raw_key_bytes); + ASSERT_EQ(current_stats.total_input_raw_value_bytes, + stats.total_input_raw_value_bytes); + + ASSERT_EQ(current_stats.num_records_replaced, + stats.num_records_replaced); + + ASSERT_EQ(current_stats.num_corrupt_keys, + stats.num_corrupt_keys); + + ASSERT_EQ( + std::string(current_stats.smallest_output_key_prefix), + std::string(stats.smallest_output_key_prefix)); + ASSERT_EQ( + std::string(current_stats.largest_output_key_prefix), + std::string(stats.largest_output_key_prefix)); + } + + // Add an expected compaction stats, which will be used to + // verify the CompactionJobStats returned by the OnCompactionCompleted() + // callback. + void AddExpectedStats(const CompactionJobStats& stats) { + std::lock_guard<std::mutex> lock(mutex_); + expected_stats_.push(stats); + } + + void EnableCompression(bool flag) { + compression_enabled_ = flag; + } + + bool verify_next_comp_io_stats() const { return verify_next_comp_io_stats_; } + + private: + std::mutex mutex_; + std::queue<CompactionJobStats> expected_stats_; + bool compression_enabled_; + bool verify_next_comp_io_stats_; +}; + +// An EventListener which helps verify the compaction statistics in +// the test DeletionStatsTest. +class CompactionJobDeletionStatsChecker : public CompactionJobStatsChecker { + public: + // Verifies whether two CompactionJobStats match. 
+ void Verify(const CompactionJobStats& current_stats, + const CompactionJobStats& stats) override { + ASSERT_EQ( + current_stats.num_input_deletion_records, + stats.num_input_deletion_records); + ASSERT_EQ( + current_stats.num_expired_deletion_records, + stats.num_expired_deletion_records); + ASSERT_EQ( + current_stats.num_records_replaced, + stats.num_records_replaced); + + ASSERT_EQ(current_stats.num_corrupt_keys, + stats.num_corrupt_keys); + } +}; + +namespace { + +uint64_t EstimatedFileSize( + uint64_t num_records, size_t key_size, size_t value_size, + double compression_ratio = 1.0, + size_t block_size = 4096, + int bloom_bits_per_key = 10) { + const size_t kPerKeyOverhead = 8; + const size_t kFooterSize = 512; + + uint64_t data_size = + static_cast<uint64_t>( + num_records * (key_size + value_size * compression_ratio + + kPerKeyOverhead)); + + return data_size + kFooterSize + + num_records * bloom_bits_per_key / 8 // filter block + + data_size * (key_size + 8) / block_size; // index block +} + +namespace { + +void CopyPrefix( + const Slice& src, size_t prefix_length, std::string* dst) { + assert(prefix_length > 0); + size_t length = src.size() > prefix_length ? prefix_length : src.size(); + dst->assign(src.data(), length); +} + +} // namespace + +CompactionJobStats NewManualCompactionJobStats( + const std::string& smallest_key, const std::string& largest_key, + size_t num_input_files, size_t num_input_files_at_output_level, + uint64_t num_input_records, size_t key_size, size_t value_size, + size_t num_output_files, uint64_t num_output_records, + double compression_ratio, uint64_t num_records_replaced, + bool is_manual = true) { + CompactionJobStats stats; + stats.Reset(); + + stats.num_input_records = num_input_records; + stats.num_input_files = num_input_files; + stats.num_input_files_at_output_level = num_input_files_at_output_level; + + stats.num_output_records = num_output_records; + stats.num_output_files = num_output_files; + + stats.total_input_bytes = + EstimatedFileSize( + num_input_records / num_input_files, + key_size, value_size, compression_ratio) * num_input_files; + stats.total_output_bytes = + EstimatedFileSize( + num_output_records / num_output_files, + key_size, value_size, compression_ratio) * num_output_files; + stats.total_input_raw_key_bytes = + num_input_records * (key_size + 8); + stats.total_input_raw_value_bytes = + num_input_records * value_size; + + stats.is_manual_compaction = is_manual; + + stats.num_records_replaced = num_records_replaced; + + CopyPrefix(smallest_key, + CompactionJobStats::kMaxPrefixLength, + &stats.smallest_output_key_prefix); + CopyPrefix(largest_key, + CompactionJobStats::kMaxPrefixLength, + &stats.largest_output_key_prefix); + + return stats; +} + +CompressionType GetAnyCompression() { + if (Snappy_Supported()) { + return kSnappyCompression; + } else if (Zlib_Supported()) { + return kZlibCompression; + } else if (BZip2_Supported()) { + return kBZip2Compression; + } else if (LZ4_Supported()) { + return kLZ4Compression; + } else if (XPRESS_Supported()) { + return kXpressCompression; + } + + return kNoCompression; +} + +} // namespace + +TEST_P(CompactionJobStatsTest, CompactionJobStatsTest) { + Random rnd(301); + const int kBufSize = 100; + char buf[kBufSize]; + uint64_t key_base = 100000000l; + // Note: key_base must be multiple of num_keys_per_L0_file + int num_keys_per_L0_file = 100; + const int kTestScale = 8; + const int kKeySize = 10; + const int kValueSize = 1000; + const double kCompressionRatio = 0.5; + double 
compression_ratio = 1.0; + uint64_t key_interval = key_base / num_keys_per_L0_file; + + // Whenever a compaction completes, this listener will try to + // verify whether the returned CompactionJobStats matches + // what we expect. The expected CompactionJobStats is added + // via AddExpectedStats(). + auto* stats_checker = new CompactionJobStatsChecker(); + Options options; + options.listeners.emplace_back(stats_checker); + options.create_if_missing = true; + // just enough setting to hold off auto-compaction. + options.level0_file_num_compaction_trigger = kTestScale + 1; + options.num_levels = 3; + options.compression = kNoCompression; + options.max_subcompactions = max_subcompactions_; + options.bytes_per_sync = 512 * 1024; + + options.report_bg_io_stats = true; + for (int test = 0; test < 2; ++test) { + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + // 1st Phase: generate "num_L0_files" L0 files. + int num_L0_files = 0; + for (uint64_t start_key = key_base; + start_key <= key_base * kTestScale; + start_key += key_base) { + MakeTableWithKeyValues( + &rnd, start_key, start_key + key_base - 1, + kKeySize, kValueSize, key_interval, + compression_ratio, 1); + snprintf(buf, kBufSize, "%d", ++num_L0_files); + ASSERT_EQ(std::string(buf), FilesPerLevel(1)); + } + ASSERT_EQ(ToString(num_L0_files), FilesPerLevel(1)); + + // 2nd Phase: perform L0 -> L1 compaction. + int L0_compaction_count = 6; + int count = 1; + std::string smallest_key; + std::string largest_key; + for (uint64_t start_key = key_base; + start_key <= key_base * L0_compaction_count; + start_key += key_base, count++) { + smallest_key = Key(start_key, 10); + largest_key = Key(start_key + key_base - key_interval, 10); + stats_checker->AddExpectedStats( + NewManualCompactionJobStats( + smallest_key, largest_key, + 1, 0, num_keys_per_L0_file, + kKeySize, kValueSize, + 1, num_keys_per_L0_file, + compression_ratio, 0)); + ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 1U); + TEST_Compact(0, 1, smallest_key, largest_key); + snprintf(buf, kBufSize, "%d,%d", num_L0_files - count, count); + ASSERT_EQ(std::string(buf), FilesPerLevel(1)); + } + + // compact two files into one in the last L0 -> L1 compaction + int num_remaining_L0 = num_L0_files - L0_compaction_count; + smallest_key = Key(key_base * (L0_compaction_count + 1), 10); + largest_key = Key(key_base * (kTestScale + 1) - key_interval, 10); + stats_checker->AddExpectedStats( + NewManualCompactionJobStats( + smallest_key, largest_key, + num_remaining_L0, + 0, num_keys_per_L0_file * num_remaining_L0, + kKeySize, kValueSize, + 1, num_keys_per_L0_file * num_remaining_L0, + compression_ratio, 0)); + ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 1U); + TEST_Compact(0, 1, smallest_key, largest_key); + + int num_L1_files = num_L0_files - num_remaining_L0 + 1; + num_L0_files = 0; + snprintf(buf, kBufSize, "%d,%d", num_L0_files, num_L1_files); + ASSERT_EQ(std::string(buf), FilesPerLevel(1)); + + // 3rd Phase: generate sparse L0 files (wider key-range, same num of keys) + int sparseness = 2; + for (uint64_t start_key = key_base; + start_key <= key_base * kTestScale; + start_key += key_base * sparseness) { + MakeTableWithKeyValues( + &rnd, start_key, start_key + key_base * sparseness - 1, + kKeySize, kValueSize, + key_base * sparseness / num_keys_per_L0_file, + compression_ratio, 1); + snprintf(buf, kBufSize, "%d,%d", ++num_L0_files, num_L1_files); + ASSERT_EQ(std::string(buf), FilesPerLevel(1)); + } + + // 4th Phase: perform L0 -> L1 compaction again, expect 
higher write amp + // When subcompactions are enabled, the number of output files increases + // by 1 because multiple threads are consuming the input and generating + // output files without coordinating to see if the output could fit into + // a smaller number of files like it does when it runs sequentially + int num_output_files = options.max_subcompactions > 1 ? 2 : 1; + for (uint64_t start_key = key_base; + num_L0_files > 1; + start_key += key_base * sparseness) { + smallest_key = Key(start_key, 10); + largest_key = + Key(start_key + key_base * sparseness - key_interval, 10); + stats_checker->AddExpectedStats( + NewManualCompactionJobStats( + smallest_key, largest_key, + 3, 2, num_keys_per_L0_file * 3, + kKeySize, kValueSize, + num_output_files, + num_keys_per_L0_file * 2, // 1/3 of the data will be updated. + compression_ratio, + num_keys_per_L0_file)); + ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 1U); + Compact(1, smallest_key, largest_key); + if (options.max_subcompactions == 1) { + --num_L1_files; + } + snprintf(buf, kBufSize, "%d,%d", --num_L0_files, num_L1_files); + ASSERT_EQ(std::string(buf), FilesPerLevel(1)); + } + + // 5th Phase: Do a full compaction, which involves in two sub-compactions. + // Here we expect to have 1 L0 files and 4 L1 files + // In the first sub-compaction, we expect L0 compaction. + smallest_key = Key(key_base, 10); + largest_key = Key(key_base * (kTestScale + 1) - key_interval, 10); + stats_checker->AddExpectedStats( + NewManualCompactionJobStats( + Key(key_base * (kTestScale + 1 - sparseness), 10), largest_key, + 2, 1, num_keys_per_L0_file * 3, + kKeySize, kValueSize, + 1, num_keys_per_L0_file * 2, + compression_ratio, + num_keys_per_L0_file)); + ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 1U); + Compact(1, smallest_key, largest_key); + + num_L1_files = options.max_subcompactions > 1 ? 
7 : 4; + char L1_buf[4]; + snprintf(L1_buf, sizeof(L1_buf), "0,%d", num_L1_files); + std::string L1_files(L1_buf); + ASSERT_EQ(L1_files, FilesPerLevel(1)); + options.compression = GetAnyCompression(); + if (options.compression == kNoCompression) { + break; + } + stats_checker->EnableCompression(true); + compression_ratio = kCompressionRatio; + + for (int i = 0; i < 5; i++) { + ASSERT_OK(Put(1, Slice(Key(key_base + i, 10)), + Slice(RandomString(&rnd, 512 * 1024, 1)))); + } + + ASSERT_OK(Flush(1)); + reinterpret_cast<DBImpl*>(db_)->TEST_WaitForCompact(); + + stats_checker->set_verify_next_comp_io_stats(true); + std::atomic<bool> first_prepare_write(true); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WritableFileWriter::Append:BeforePrepareWrite", [&](void* /*arg*/) { + if (first_prepare_write.load()) { + options.env->SleepForMicroseconds(3); + first_prepare_write.store(false); + } + }); + + std::atomic<bool> first_flush(true); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WritableFileWriter::Flush:BeforeAppend", [&](void* /*arg*/) { + if (first_flush.load()) { + options.env->SleepForMicroseconds(3); + first_flush.store(false); + } + }); + + std::atomic<bool> first_sync(true); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WritableFileWriter::SyncInternal:0", [&](void* /*arg*/) { + if (first_sync.load()) { + options.env->SleepForMicroseconds(3); + first_sync.store(false); + } + }); + + std::atomic<bool> first_range_sync(true); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WritableFileWriter::RangeSync:0", [&](void* /*arg*/) { + if (first_range_sync.load()) { + options.env->SleepForMicroseconds(3); + first_range_sync.store(false); + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Compact(1, smallest_key, largest_key); + + ASSERT_TRUE(!stats_checker->verify_next_comp_io_stats()); + ASSERT_TRUE(!first_prepare_write.load()); + ASSERT_TRUE(!first_flush.load()); + ASSERT_TRUE(!first_sync.load()); + ASSERT_TRUE(!first_range_sync.load()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + } + ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 0U); +} + +TEST_P(CompactionJobStatsTest, DeletionStatsTest) { + Random rnd(301); + uint64_t key_base = 100000l; + // Note: key_base must be multiple of num_keys_per_L0_file + int num_keys_per_L0_file = 20; + const int kTestScale = 8; // make sure this is even + const int kKeySize = 10; + const int kValueSize = 100; + double compression_ratio = 1.0; + uint64_t key_interval = key_base / num_keys_per_L0_file; + uint64_t largest_key_num = key_base * (kTestScale + 1) - key_interval; + uint64_t cutoff_key_num = key_base * (kTestScale / 2 + 1) - key_interval; + const std::string smallest_key = Key(key_base - 10, kKeySize); + const std::string largest_key = Key(largest_key_num + 10, kKeySize); + + // Whenever a compaction completes, this listener will try to + // verify whether the returned CompactionJobStats matches + // what we expect. 
+ auto* stats_checker = new CompactionJobDeletionStatsChecker(); + Options options; + options.listeners.emplace_back(stats_checker); + options.create_if_missing = true; + options.level0_file_num_compaction_trigger = kTestScale+1; + options.num_levels = 3; + options.compression = kNoCompression; + options.max_bytes_for_level_multiplier = 2; + options.max_subcompactions = max_subcompactions_; + + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + // Stage 1: Generate several L0 files and then send them to L2 by + // using CompactRangeOptions and CompactRange(). These files will + // have a strict subset of the keys from the full key-range + for (uint64_t start_key = key_base; + start_key <= key_base * kTestScale / 2; + start_key += key_base) { + MakeTableWithKeyValues( + &rnd, start_key, start_key + key_base - 1, + kKeySize, kValueSize, key_interval, + compression_ratio, 1); + } + + CompactRangeOptions cr_options; + cr_options.change_level = true; + cr_options.target_level = 2; + db_->CompactRange(cr_options, handles_[1], nullptr, nullptr); + ASSERT_GT(NumTableFilesAtLevel(2, 1), 0); + + // Stage 2: Generate files including keys from the entire key range + for (uint64_t start_key = key_base; + start_key <= key_base * kTestScale; + start_key += key_base) { + MakeTableWithKeyValues( + &rnd, start_key, start_key + key_base - 1, + kKeySize, kValueSize, key_interval, + compression_ratio, 1); + } + + // Send these L0 files to L1 + TEST_Compact(0, 1, smallest_key, largest_key); + ASSERT_GT(NumTableFilesAtLevel(1, 1), 0); + + // Add a new record and flush so now there is a L0 file + // with a value too (not just deletions from the next step) + ASSERT_OK(Put(1, Key(key_base-6, kKeySize), "test")); + ASSERT_OK(Flush(1)); + + // Stage 3: Generate L0 files with some deletions so now + // there are files with the same key range in L0, L1, and L2 + int deletion_interval = 3; + CompactionJobStats first_compaction_stats; + SelectivelyDeleteKeys(key_base, largest_key_num, + key_interval, deletion_interval, kKeySize, cutoff_key_num, + &first_compaction_stats, 1); + + stats_checker->AddExpectedStats(first_compaction_stats); + + // Stage 4: Trigger compaction and verify the stats + TEST_Compact(0, 1, smallest_key, largest_key); +} + +namespace { +int GetUniversalCompactionInputUnits(uint32_t num_flushes) { + uint32_t compaction_input_units; + for (compaction_input_units = 1; + num_flushes >= compaction_input_units; + compaction_input_units *= 2) { + if ((num_flushes & compaction_input_units) != 0) { + return compaction_input_units > 1 ? 
compaction_input_units : 0; + } + } + return 0; +} +} // namespace + +TEST_P(CompactionJobStatsTest, UniversalCompactionTest) { + Random rnd(301); + uint64_t key_base = 100000000l; + // Note: key_base must be multiple of num_keys_per_L0_file + int num_keys_per_table = 100; + const uint32_t kTestScale = 6; + const int kKeySize = 10; + const int kValueSize = 900; + double compression_ratio = 1.0; + uint64_t key_interval = key_base / num_keys_per_table; + + auto* stats_checker = new CompactionJobStatsChecker(); + Options options; + options.listeners.emplace_back(stats_checker); + options.create_if_missing = true; + options.num_levels = 3; + options.compression = kNoCompression; + options.level0_file_num_compaction_trigger = 2; + options.target_file_size_base = num_keys_per_table * 1000; + options.compaction_style = kCompactionStyleUniversal; + options.compaction_options_universal.size_ratio = 1; + options.compaction_options_universal.max_size_amplification_percent = 1000; + options.max_subcompactions = max_subcompactions_; + + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + // Generates the expected CompactionJobStats for each compaction + for (uint32_t num_flushes = 2; num_flushes <= kTestScale; num_flushes++) { + // Here we treat one newly flushed file as an unit. + // + // For example, if a newly flushed file is 100k, and a compaction has + // 4 input units, then this compaction inputs 400k. + uint32_t num_input_units = GetUniversalCompactionInputUnits(num_flushes); + if (num_input_units == 0) { + continue; + } + // The following statement determines the expected smallest key + // based on whether it is a full compaction. A full compaction only + // happens when the number of flushes equals to the number of compaction + // input runs. + uint64_t smallest_key = + (num_flushes == num_input_units) ? + key_base : key_base * (num_flushes - 1); + + stats_checker->AddExpectedStats( + NewManualCompactionJobStats( + Key(smallest_key, 10), + Key(smallest_key + key_base * num_input_units - key_interval, 10), + num_input_units, + num_input_units > 2 ? 
num_input_units / 2 : 0, + num_keys_per_table * num_input_units, + kKeySize, kValueSize, + num_input_units, + num_keys_per_table * num_input_units, + 1.0, 0, false)); + dbfull()->TEST_WaitForCompact(); + } + ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 3U); + + for (uint64_t start_key = key_base; + start_key <= key_base * kTestScale; + start_key += key_base) { + MakeTableWithKeyValues( + &rnd, start_key, start_key + key_base - 1, + kKeySize, kValueSize, key_interval, + compression_ratio, 1); + reinterpret_cast<DBImpl*>(db_)->TEST_WaitForCompact(); + } + ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 0U); +} + +INSTANTIATE_TEST_CASE_P(CompactionJobStatsTest, CompactionJobStatsTest, + ::testing::Values(1, 4)); +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + +#else +#include <stdio.h> + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, "SKIPPED, not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // !ROCKSDB_LITE + +#else + +int main(int /*argc*/, char** /*argv*/) { return 0; } +#endif // !defined(IOS_CROSS_COMPILE) diff --git a/src/rocksdb/db/compaction/compaction_job_test.cc b/src/rocksdb/db/compaction/compaction_job_test.cc new file mode 100644 index 000000000..e7b46ef97 --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_job_test.cc @@ -0,0 +1,1082 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE + +#include <algorithm> +#include <array> +#include <cinttypes> +#include <map> +#include <string> +#include <tuple> + +#include "db/blob_index.h" +#include "db/column_family.h" +#include "db/compaction/compaction_job.h" +#include "db/db_impl/db_impl.h" +#include "db/error_handler.h" +#include "db/version_set.h" +#include "file/writable_file_writer.h" +#include "rocksdb/cache.h" +#include "rocksdb/db.h" +#include "rocksdb/options.h" +#include "rocksdb/write_buffer_manager.h" +#include "table/mock_table.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/string_util.h" +#include "utilities/merge_operators.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { + +void VerifyInitializationOfCompactionJobStats( + const CompactionJobStats& compaction_job_stats) { +#if !defined(IOS_CROSS_COMPILE) + ASSERT_EQ(compaction_job_stats.elapsed_micros, 0U); + + ASSERT_EQ(compaction_job_stats.num_input_records, 0U); + ASSERT_EQ(compaction_job_stats.num_input_files, 0U); + ASSERT_EQ(compaction_job_stats.num_input_files_at_output_level, 0U); + + ASSERT_EQ(compaction_job_stats.num_output_records, 0U); + ASSERT_EQ(compaction_job_stats.num_output_files, 0U); + + ASSERT_EQ(compaction_job_stats.is_manual_compaction, true); + + ASSERT_EQ(compaction_job_stats.total_input_bytes, 0U); + ASSERT_EQ(compaction_job_stats.total_output_bytes, 0U); + + ASSERT_EQ(compaction_job_stats.total_input_raw_key_bytes, 0U); + ASSERT_EQ(compaction_job_stats.total_input_raw_value_bytes, 0U); + + ASSERT_EQ(compaction_job_stats.smallest_output_key_prefix[0], 0); + ASSERT_EQ(compaction_job_stats.largest_output_key_prefix[0], 0); + + ASSERT_EQ(compaction_job_stats.num_records_replaced, 0U); + + ASSERT_EQ(compaction_job_stats.num_input_deletion_records, 0U); + 
ASSERT_EQ(compaction_job_stats.num_expired_deletion_records, 0U); + + ASSERT_EQ(compaction_job_stats.num_corrupt_keys, 0U); +#endif // !defined(IOS_CROSS_COMPILE) +} + +} // namespace + +// TODO(icanadi) Make it simpler once we mock out VersionSet +class CompactionJobTest : public testing::Test { + public: + CompactionJobTest() + : env_(Env::Default()), + fs_(std::make_shared<LegacyFileSystemWrapper>(env_)), + dbname_(test::PerThreadDBPath("compaction_job_test")), + db_options_(), + mutable_cf_options_(cf_options_), + table_cache_(NewLRUCache(50000, 16)), + write_buffer_manager_(db_options_.db_write_buffer_size), + versions_(new VersionSet(dbname_, &db_options_, env_options_, + table_cache_.get(), &write_buffer_manager_, + &write_controller_, + /*block_cache_tracer=*/nullptr)), + shutting_down_(false), + preserve_deletes_seqnum_(0), + mock_table_factory_(new mock::MockTableFactory()), + error_handler_(nullptr, db_options_, &mutex_) { + EXPECT_OK(env_->CreateDirIfMissing(dbname_)); + db_options_.env = env_; + db_options_.fs = fs_; + db_options_.db_paths.emplace_back(dbname_, + std::numeric_limits<uint64_t>::max()); + } + + std::string GenerateFileName(uint64_t file_number) { + FileMetaData meta; + std::vector<DbPath> db_paths; + db_paths.emplace_back(dbname_, std::numeric_limits<uint64_t>::max()); + meta.fd = FileDescriptor(file_number, 0, 0); + return TableFileName(db_paths, meta.fd.GetNumber(), meta.fd.GetPathId()); + } + + static std::string KeyStr(const std::string& user_key, + const SequenceNumber seq_num, const ValueType t) { + return InternalKey(user_key, seq_num, t).Encode().ToString(); + } + + static std::string BlobStr(uint64_t blob_file_number, uint64_t offset, + uint64_t size) { + std::string blob_index; + BlobIndex::EncodeBlob(&blob_index, blob_file_number, offset, size, + kNoCompression); + return blob_index; + } + + static std::string BlobStrTTL(uint64_t blob_file_number, uint64_t offset, + uint64_t size, uint64_t expiration) { + std::string blob_index; + BlobIndex::EncodeBlobTTL(&blob_index, expiration, blob_file_number, offset, + size, kNoCompression); + return blob_index; + } + + static std::string BlobStrInlinedTTL(const Slice& value, + uint64_t expiration) { + std::string blob_index; + BlobIndex::EncodeInlinedTTL(&blob_index, expiration, value); + return blob_index; + } + + void AddMockFile(const stl_wrappers::KVMap& contents, int level = 0) { + assert(contents.size() > 0); + + bool first_key = true; + std::string smallest, largest; + InternalKey smallest_key, largest_key; + SequenceNumber smallest_seqno = kMaxSequenceNumber; + SequenceNumber largest_seqno = 0; + uint64_t oldest_blob_file_number = kInvalidBlobFileNumber; + for (auto kv : contents) { + ParsedInternalKey key; + std::string skey; + std::string value; + std::tie(skey, value) = kv; + bool parsed = ParseInternalKey(skey, &key); + + smallest_seqno = std::min(smallest_seqno, key.sequence); + largest_seqno = std::max(largest_seqno, key.sequence); + + if (first_key || + cfd_->user_comparator()->Compare(key.user_key, smallest) < 0) { + smallest.assign(key.user_key.data(), key.user_key.size()); + smallest_key.DecodeFrom(skey); + } + if (first_key || + cfd_->user_comparator()->Compare(key.user_key, largest) > 0) { + largest.assign(key.user_key.data(), key.user_key.size()); + largest_key.DecodeFrom(skey); + } + + first_key = false; + + if (parsed && key.type == kTypeBlobIndex) { + BlobIndex blob_index; + const Status s = blob_index.DecodeFrom(value); + if (!s.ok()) { + continue; + } + + if (blob_index.IsInlined() || 
blob_index.HasTTL() || + blob_index.file_number() == kInvalidBlobFileNumber) { + continue; + } + + if (oldest_blob_file_number == kInvalidBlobFileNumber || + oldest_blob_file_number > blob_index.file_number()) { + oldest_blob_file_number = blob_index.file_number(); + } + } + } + + uint64_t file_number = versions_->NewFileNumber(); + EXPECT_OK(mock_table_factory_->CreateMockTable( + env_, GenerateFileName(file_number), std::move(contents))); + + VersionEdit edit; + edit.AddFile(level, file_number, 0, 10, smallest_key, largest_key, + smallest_seqno, largest_seqno, false, oldest_blob_file_number, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName); + + mutex_.Lock(); + versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(), + mutable_cf_options_, &edit, &mutex_); + mutex_.Unlock(); + } + + void SetLastSequence(const SequenceNumber sequence_number) { + versions_->SetLastAllocatedSequence(sequence_number + 1); + versions_->SetLastPublishedSequence(sequence_number + 1); + versions_->SetLastSequence(sequence_number + 1); + } + + // returns expected result after compaction + stl_wrappers::KVMap CreateTwoFiles(bool gen_corrupted_keys) { + auto expected_results = mock::MakeMockFile(); + const int kKeysPerFile = 10000; + const int kCorruptKeysPerFile = 200; + const int kMatchingKeys = kKeysPerFile / 2; + SequenceNumber sequence_number = 0; + + auto corrupt_id = [&](int id) { + return gen_corrupted_keys && id > 0 && id <= kCorruptKeysPerFile; + }; + + for (int i = 0; i < 2; ++i) { + auto contents = mock::MakeMockFile(); + for (int k = 0; k < kKeysPerFile; ++k) { + auto key = ToString(i * kMatchingKeys + k); + auto value = ToString(i * kKeysPerFile + k); + InternalKey internal_key(key, ++sequence_number, kTypeValue); + + // This is how the key will look like once it's written in bottommost + // file + InternalKey bottommost_internal_key( + key, 0, kTypeValue); + + if (corrupt_id(k)) { + test::CorruptKeyType(&internal_key); + test::CorruptKeyType(&bottommost_internal_key); + } + contents.insert({ internal_key.Encode().ToString(), value }); + if (i == 1 || k < kMatchingKeys || corrupt_id(k - kMatchingKeys)) { + expected_results.insert( + { bottommost_internal_key.Encode().ToString(), value }); + } + } + + AddMockFile(contents); + } + + SetLastSequence(sequence_number); + + return expected_results; + } + + void NewDB() { + DestroyDB(dbname_, Options()); + EXPECT_OK(env_->CreateDirIfMissing(dbname_)); + versions_.reset(new VersionSet(dbname_, &db_options_, env_options_, + table_cache_.get(), &write_buffer_manager_, + &write_controller_, + /*block_cache_tracer=*/nullptr)); + compaction_job_stats_.Reset(); + SetIdentityFile(env_, dbname_); + + VersionEdit new_db; + if (db_options_.write_dbid_to_manifest) { + DBImpl* impl = new DBImpl(DBOptions(), dbname_); + std::string db_id; + impl->GetDbIdentityFromIdentityFile(&db_id); + new_db.SetDBId(db_id); + } + new_db.SetLogNumber(0); + new_db.SetNextFile(2); + new_db.SetLastSequence(0); + + const std::string manifest = DescriptorFileName(dbname_, 1); + std::unique_ptr<WritableFile> file; + Status s = env_->NewWritableFile( + manifest, &file, env_->OptimizeForManifestWrite(env_options_)); + ASSERT_OK(s); + std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter( + NewLegacyWritableFileWrapper(std::move(file)), manifest, env_options_)); + { + log::Writer log(std::move(file_writer), 0, false); + std::string record; + new_db.EncodeTo(&record); + s = log.AddRecord(record); + } + 
ASSERT_OK(s); + // Make "CURRENT" file that points to the new manifest file. + s = SetCurrentFile(env_, dbname_, 1, nullptr); + + std::vector<ColumnFamilyDescriptor> column_families; + cf_options_.table_factory = mock_table_factory_; + cf_options_.merge_operator = merge_op_; + cf_options_.compaction_filter = compaction_filter_.get(); + column_families.emplace_back(kDefaultColumnFamilyName, cf_options_); + + EXPECT_OK(versions_->Recover(column_families, false)); + cfd_ = versions_->GetColumnFamilySet()->GetDefault(); + } + + void RunCompaction( + const std::vector<std::vector<FileMetaData*>>& input_files, + const stl_wrappers::KVMap& expected_results, + const std::vector<SequenceNumber>& snapshots = {}, + SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber, + int output_level = 1, bool verify = true, + uint64_t expected_oldest_blob_file_number = kInvalidBlobFileNumber) { + auto cfd = versions_->GetColumnFamilySet()->GetDefault(); + + size_t num_input_files = 0; + std::vector<CompactionInputFiles> compaction_input_files; + for (size_t level = 0; level < input_files.size(); level++) { + auto level_files = input_files[level]; + CompactionInputFiles compaction_level; + compaction_level.level = static_cast<int>(level); + compaction_level.files.insert(compaction_level.files.end(), + level_files.begin(), level_files.end()); + compaction_input_files.push_back(compaction_level); + num_input_files += level_files.size(); + } + + Compaction compaction(cfd->current()->storage_info(), *cfd->ioptions(), + *cfd->GetLatestMutableCFOptions(), + compaction_input_files, output_level, 1024 * 1024, + 10 * 1024 * 1024, 0, kNoCompression, + cfd->ioptions()->compression_opts, 0, {}, true); + compaction.SetInputVersion(cfd->current()); + + LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get()); + mutex_.Lock(); + EventLogger event_logger(db_options_.info_log.get()); + // TODO(yiwu) add a mock snapshot checker and add test for it. 
+ SnapshotChecker* snapshot_checker = nullptr; + CompactionJob compaction_job( + 0, &compaction, db_options_, env_options_, versions_.get(), + &shutting_down_, preserve_deletes_seqnum_, &log_buffer, nullptr, + nullptr, nullptr, &mutex_, &error_handler_, snapshots, + earliest_write_conflict_snapshot, snapshot_checker, table_cache_, + &event_logger, false, false, dbname_, &compaction_job_stats_, + Env::Priority::USER); + VerifyInitializationOfCompactionJobStats(compaction_job_stats_); + + compaction_job.Prepare(); + mutex_.Unlock(); + Status s; + s = compaction_job.Run(); + ASSERT_OK(s); + mutex_.Lock(); + ASSERT_OK(compaction_job.Install(*cfd->GetLatestMutableCFOptions())); + mutex_.Unlock(); + + if (verify) { + ASSERT_GE(compaction_job_stats_.elapsed_micros, 0U); + ASSERT_EQ(compaction_job_stats_.num_input_files, num_input_files); + + if (expected_results.empty()) { + ASSERT_EQ(compaction_job_stats_.num_output_files, 0U); + } else { + ASSERT_EQ(compaction_job_stats_.num_output_files, 1U); + mock_table_factory_->AssertLatestFile(expected_results); + + auto output_files = + cfd->current()->storage_info()->LevelFiles(output_level); + ASSERT_EQ(output_files.size(), 1); + ASSERT_EQ(output_files[0]->oldest_blob_file_number, + expected_oldest_blob_file_number); + } + } + } + + Env* env_; + std::shared_ptr<FileSystem> fs_; + std::string dbname_; + EnvOptions env_options_; + ImmutableDBOptions db_options_; + ColumnFamilyOptions cf_options_; + MutableCFOptions mutable_cf_options_; + std::shared_ptr<Cache> table_cache_; + WriteController write_controller_; + WriteBufferManager write_buffer_manager_; + std::unique_ptr<VersionSet> versions_; + InstrumentedMutex mutex_; + std::atomic<bool> shutting_down_; + SequenceNumber preserve_deletes_seqnum_; + std::shared_ptr<mock::MockTableFactory> mock_table_factory_; + CompactionJobStats compaction_job_stats_; + ColumnFamilyData* cfd_; + std::unique_ptr<CompactionFilter> compaction_filter_; + std::shared_ptr<MergeOperator> merge_op_; + ErrorHandler error_handler_; +}; + +TEST_F(CompactionJobTest, Simple) { + NewDB(); + + auto expected_results = CreateTwoFiles(false); + auto cfd = versions_->GetColumnFamilySet()->GetDefault(); + auto files = cfd->current()->storage_info()->LevelFiles(0); + ASSERT_EQ(2U, files.size()); + RunCompaction({ files }, expected_results); +} + +TEST_F(CompactionJobTest, SimpleCorrupted) { + NewDB(); + + auto expected_results = CreateTwoFiles(true); + auto cfd = versions_->GetColumnFamilySet()->GetDefault(); + auto files = cfd->current()->storage_info()->LevelFiles(0); + RunCompaction({files}, expected_results); + ASSERT_EQ(compaction_job_stats_.num_corrupt_keys, 400U); +} + +TEST_F(CompactionJobTest, SimpleDeletion) { + NewDB(); + + auto file1 = mock::MakeMockFile({{KeyStr("c", 4U, kTypeDeletion), ""}, + {KeyStr("c", 3U, kTypeValue), "val"}}); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile({{KeyStr("b", 2U, kTypeValue), "val"}, + {KeyStr("b", 1U, kTypeValue), "val"}}); + AddMockFile(file2); + + auto expected_results = + mock::MakeMockFile({{KeyStr("b", 0U, kTypeValue), "val"}}); + + SetLastSequence(4U); + auto files = cfd_->current()->storage_info()->LevelFiles(0); + RunCompaction({files}, expected_results); +} + +TEST_F(CompactionJobTest, OutputNothing) { + NewDB(); + + auto file1 = mock::MakeMockFile({{KeyStr("a", 1U, kTypeValue), "val"}}); + + AddMockFile(file1); + + auto file2 = mock::MakeMockFile({{KeyStr("a", 2U, kTypeDeletion), ""}}); + + AddMockFile(file2); + + auto expected_results = mock::MakeMockFile(); + + 
SetLastSequence(4U); + auto files = cfd_->current()->storage_info()->LevelFiles(0); + RunCompaction({files}, expected_results); +} + +TEST_F(CompactionJobTest, SimpleOverwrite) { + NewDB(); + + auto file1 = mock::MakeMockFile({ + {KeyStr("a", 3U, kTypeValue), "val2"}, + {KeyStr("b", 4U, kTypeValue), "val3"}, + }); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile({{KeyStr("a", 1U, kTypeValue), "val"}, + {KeyStr("b", 2U, kTypeValue), "val"}}); + AddMockFile(file2); + + auto expected_results = + mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), "val2"}, + {KeyStr("b", 0U, kTypeValue), "val3"}}); + + SetLastSequence(4U); + auto files = cfd_->current()->storage_info()->LevelFiles(0); + RunCompaction({files}, expected_results); +} + +TEST_F(CompactionJobTest, SimpleNonLastLevel) { + NewDB(); + + auto file1 = mock::MakeMockFile({ + {KeyStr("a", 5U, kTypeValue), "val2"}, + {KeyStr("b", 6U, kTypeValue), "val3"}, + }); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile({{KeyStr("a", 3U, kTypeValue), "val"}, + {KeyStr("b", 4U, kTypeValue), "val"}}); + AddMockFile(file2, 1); + + auto file3 = mock::MakeMockFile({{KeyStr("a", 1U, kTypeValue), "val"}, + {KeyStr("b", 2U, kTypeValue), "val"}}); + AddMockFile(file3, 2); + + // Because level 1 is not the last level, the sequence numbers of a and b + // cannot be set to 0 + auto expected_results = + mock::MakeMockFile({{KeyStr("a", 5U, kTypeValue), "val2"}, + {KeyStr("b", 6U, kTypeValue), "val3"}}); + + SetLastSequence(6U); + auto lvl0_files = cfd_->current()->storage_info()->LevelFiles(0); + auto lvl1_files = cfd_->current()->storage_info()->LevelFiles(1); + RunCompaction({lvl0_files, lvl1_files}, expected_results); +} + +TEST_F(CompactionJobTest, SimpleMerge) { + merge_op_ = MergeOperators::CreateStringAppendOperator(); + NewDB(); + + auto file1 = mock::MakeMockFile({ + {KeyStr("a", 5U, kTypeMerge), "5"}, + {KeyStr("a", 4U, kTypeMerge), "4"}, + {KeyStr("a", 3U, kTypeValue), "3"}, + }); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile( + {{KeyStr("b", 2U, kTypeMerge), "2"}, {KeyStr("b", 1U, kTypeValue), "1"}}); + AddMockFile(file2); + + auto expected_results = + mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), "3,4,5"}, + {KeyStr("b", 0U, kTypeValue), "1,2"}}); + + SetLastSequence(5U); + auto files = cfd_->current()->storage_info()->LevelFiles(0); + RunCompaction({files}, expected_results); +} + +TEST_F(CompactionJobTest, NonAssocMerge) { + merge_op_ = MergeOperators::CreateStringAppendTESTOperator(); + NewDB(); + + auto file1 = mock::MakeMockFile({ + {KeyStr("a", 5U, kTypeMerge), "5"}, + {KeyStr("a", 4U, kTypeMerge), "4"}, + {KeyStr("a", 3U, kTypeMerge), "3"}, + }); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile( + {{KeyStr("b", 2U, kTypeMerge), "2"}, {KeyStr("b", 1U, kTypeMerge), "1"}}); + AddMockFile(file2); + + auto expected_results = + mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), "3,4,5"}, + {KeyStr("b", 0U, kTypeValue), "1,2"}}); + + SetLastSequence(5U); + auto files = cfd_->current()->storage_info()->LevelFiles(0); + RunCompaction({files}, expected_results); +} + +// Filters merge operands with value 10. 
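For concreteness (derived from the mock file contents in the test below, not additional logic): with the UInt64AddOperator and test::FilterNumber(10U), every operand equal to 10 is dropped before merging, so

  "a": 5 + 3  ->  EncodeInt(8U)
  "b": 2      ->  EncodeInt(2U)

which is exactly what expected_results encodes.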
+TEST_F(CompactionJobTest, MergeOperandFilter) { + merge_op_ = MergeOperators::CreateUInt64AddOperator(); + compaction_filter_.reset(new test::FilterNumber(10U)); + NewDB(); + + auto file1 = mock::MakeMockFile( + {{KeyStr("a", 5U, kTypeMerge), test::EncodeInt(5U)}, + {KeyStr("a", 4U, kTypeMerge), test::EncodeInt(10U)}, // Filtered + {KeyStr("a", 3U, kTypeMerge), test::EncodeInt(3U)}}); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile({ + {KeyStr("b", 2U, kTypeMerge), test::EncodeInt(2U)}, + {KeyStr("b", 1U, kTypeMerge), test::EncodeInt(10U)} // Filtered + }); + AddMockFile(file2); + + auto expected_results = + mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), test::EncodeInt(8U)}, + {KeyStr("b", 0U, kTypeValue), test::EncodeInt(2U)}}); + + SetLastSequence(5U); + auto files = cfd_->current()->storage_info()->LevelFiles(0); + RunCompaction({files}, expected_results); +} + +TEST_F(CompactionJobTest, FilterSomeMergeOperands) { + merge_op_ = MergeOperators::CreateUInt64AddOperator(); + compaction_filter_.reset(new test::FilterNumber(10U)); + NewDB(); + + auto file1 = mock::MakeMockFile( + {{KeyStr("a", 5U, kTypeMerge), test::EncodeInt(5U)}, + {KeyStr("a", 4U, kTypeMerge), test::EncodeInt(10U)}, // Filtered + {KeyStr("a", 3U, kTypeValue), test::EncodeInt(5U)}, + {KeyStr("d", 8U, kTypeMerge), test::EncodeInt(10U)}}); + AddMockFile(file1); + + auto file2 = + mock::MakeMockFile({{KeyStr("b", 2U, kTypeMerge), test::EncodeInt(10U)}, + {KeyStr("b", 1U, kTypeMerge), test::EncodeInt(10U)}, + {KeyStr("c", 2U, kTypeMerge), test::EncodeInt(3U)}, + {KeyStr("c", 1U, kTypeValue), test::EncodeInt(7U)}, + {KeyStr("d", 1U, kTypeValue), test::EncodeInt(6U)}}); + AddMockFile(file2); + + auto file3 = + mock::MakeMockFile({{KeyStr("a", 1U, kTypeMerge), test::EncodeInt(3U)}}); + AddMockFile(file3, 2); + + auto expected_results = mock::MakeMockFile({ + {KeyStr("a", 5U, kTypeValue), test::EncodeInt(10U)}, + {KeyStr("c", 2U, kTypeValue), test::EncodeInt(10U)}, + {KeyStr("d", 1U, kTypeValue), test::EncodeInt(6U)} + // b does not appear because the operands are filtered + }); + + SetLastSequence(5U); + auto files = cfd_->current()->storage_info()->LevelFiles(0); + RunCompaction({files}, expected_results); +} + +// Test where all operands/merge results are filtered out. 
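+// Every operand in this test encodes the value 10, so the compaction filter
+// removes all of them and the compaction is expected to produce no output
+// file at all (the expected result is an empty KVMap).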
+TEST_F(CompactionJobTest, FilterAllMergeOperands) { + merge_op_ = MergeOperators::CreateUInt64AddOperator(); + compaction_filter_.reset(new test::FilterNumber(10U)); + NewDB(); + + auto file1 = + mock::MakeMockFile({{KeyStr("a", 11U, kTypeMerge), test::EncodeInt(10U)}, + {KeyStr("a", 10U, kTypeMerge), test::EncodeInt(10U)}, + {KeyStr("a", 9U, kTypeMerge), test::EncodeInt(10U)}}); + AddMockFile(file1); + + auto file2 = + mock::MakeMockFile({{KeyStr("b", 8U, kTypeMerge), test::EncodeInt(10U)}, + {KeyStr("b", 7U, kTypeMerge), test::EncodeInt(10U)}, + {KeyStr("b", 6U, kTypeMerge), test::EncodeInt(10U)}, + {KeyStr("b", 5U, kTypeMerge), test::EncodeInt(10U)}, + {KeyStr("b", 4U, kTypeMerge), test::EncodeInt(10U)}, + {KeyStr("b", 3U, kTypeMerge), test::EncodeInt(10U)}, + {KeyStr("b", 2U, kTypeMerge), test::EncodeInt(10U)}, + {KeyStr("c", 2U, kTypeMerge), test::EncodeInt(10U)}, + {KeyStr("c", 1U, kTypeMerge), test::EncodeInt(10U)}}); + AddMockFile(file2); + + auto file3 = + mock::MakeMockFile({{KeyStr("a", 2U, kTypeMerge), test::EncodeInt(10U)}, + {KeyStr("b", 1U, kTypeMerge), test::EncodeInt(10U)}}); + AddMockFile(file3, 2); + + SetLastSequence(11U); + auto files = cfd_->current()->storage_info()->LevelFiles(0); + + stl_wrappers::KVMap empty_map; + RunCompaction({files}, empty_map); +} + +TEST_F(CompactionJobTest, SimpleSingleDelete) { + NewDB(); + + auto file1 = mock::MakeMockFile({ + {KeyStr("a", 5U, kTypeDeletion), ""}, + {KeyStr("b", 6U, kTypeSingleDeletion), ""}, + }); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile({{KeyStr("a", 3U, kTypeValue), "val"}, + {KeyStr("b", 4U, kTypeValue), "val"}}); + AddMockFile(file2); + + auto file3 = mock::MakeMockFile({ + {KeyStr("a", 1U, kTypeValue), "val"}, + }); + AddMockFile(file3, 2); + + auto expected_results = + mock::MakeMockFile({{KeyStr("a", 5U, kTypeDeletion), ""}}); + + SetLastSequence(6U); + auto files = cfd_->current()->storage_info()->LevelFiles(0); + RunCompaction({files}, expected_results); +} + +TEST_F(CompactionJobTest, SingleDeleteSnapshots) { + NewDB(); + + auto file1 = mock::MakeMockFile({ + {KeyStr("A", 12U, kTypeSingleDeletion), ""}, + {KeyStr("a", 12U, kTypeSingleDeletion), ""}, + {KeyStr("b", 21U, kTypeSingleDeletion), ""}, + {KeyStr("c", 22U, kTypeSingleDeletion), ""}, + {KeyStr("d", 9U, kTypeSingleDeletion), ""}, + {KeyStr("f", 21U, kTypeSingleDeletion), ""}, + {KeyStr("j", 11U, kTypeSingleDeletion), ""}, + {KeyStr("j", 9U, kTypeSingleDeletion), ""}, + {KeyStr("k", 12U, kTypeSingleDeletion), ""}, + {KeyStr("k", 11U, kTypeSingleDeletion), ""}, + {KeyStr("l", 3U, kTypeSingleDeletion), ""}, + {KeyStr("l", 2U, kTypeSingleDeletion), ""}, + }); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile({ + {KeyStr("0", 2U, kTypeSingleDeletion), ""}, + {KeyStr("a", 11U, kTypeValue), "val1"}, + {KeyStr("b", 11U, kTypeValue), "val2"}, + {KeyStr("c", 21U, kTypeValue), "val3"}, + {KeyStr("d", 8U, kTypeValue), "val4"}, + {KeyStr("e", 2U, kTypeSingleDeletion), ""}, + {KeyStr("f", 1U, kTypeValue), "val1"}, + {KeyStr("g", 11U, kTypeSingleDeletion), ""}, + {KeyStr("h", 2U, kTypeSingleDeletion), ""}, + {KeyStr("m", 12U, kTypeValue), "val1"}, + {KeyStr("m", 11U, kTypeSingleDeletion), ""}, + {KeyStr("m", 8U, kTypeValue), "val2"}, + }); + AddMockFile(file2); + + auto file3 = mock::MakeMockFile({ + {KeyStr("A", 1U, kTypeValue), "val"}, + {KeyStr("e", 1U, kTypeValue), "val"}, + }); + AddMockFile(file3, 2); + + auto expected_results = mock::MakeMockFile({ + {KeyStr("A", 12U, kTypeSingleDeletion), ""}, + {KeyStr("a", 12U, kTypeSingleDeletion), 
""}, + {KeyStr("a", 11U, kTypeValue), ""}, + {KeyStr("b", 21U, kTypeSingleDeletion), ""}, + {KeyStr("b", 11U, kTypeValue), "val2"}, + {KeyStr("c", 22U, kTypeSingleDeletion), ""}, + {KeyStr("c", 21U, kTypeValue), ""}, + {KeyStr("e", 2U, kTypeSingleDeletion), ""}, + {KeyStr("f", 21U, kTypeSingleDeletion), ""}, + {KeyStr("f", 1U, kTypeValue), "val1"}, + {KeyStr("g", 11U, kTypeSingleDeletion), ""}, + {KeyStr("j", 11U, kTypeSingleDeletion), ""}, + {KeyStr("k", 11U, kTypeSingleDeletion), ""}, + {KeyStr("m", 12U, kTypeValue), "val1"}, + {KeyStr("m", 11U, kTypeSingleDeletion), ""}, + {KeyStr("m", 8U, kTypeValue), "val2"}, + }); + + SetLastSequence(22U); + auto files = cfd_->current()->storage_info()->LevelFiles(0); + RunCompaction({files}, expected_results, {10U, 20U}, 10U); +} + +TEST_F(CompactionJobTest, EarliestWriteConflictSnapshot) { + NewDB(); + + // Test multiple snapshots where the earliest snapshot is not a + // write-conflic-snapshot. + + auto file1 = mock::MakeMockFile({ + {KeyStr("A", 24U, kTypeSingleDeletion), ""}, + {KeyStr("A", 23U, kTypeValue), "val"}, + {KeyStr("B", 24U, kTypeSingleDeletion), ""}, + {KeyStr("B", 23U, kTypeValue), "val"}, + {KeyStr("D", 24U, kTypeSingleDeletion), ""}, + {KeyStr("G", 32U, kTypeSingleDeletion), ""}, + {KeyStr("G", 31U, kTypeValue), "val"}, + {KeyStr("G", 24U, kTypeSingleDeletion), ""}, + {KeyStr("G", 23U, kTypeValue), "val2"}, + {KeyStr("H", 31U, kTypeValue), "val"}, + {KeyStr("H", 24U, kTypeSingleDeletion), ""}, + {KeyStr("H", 23U, kTypeValue), "val"}, + {KeyStr("I", 35U, kTypeSingleDeletion), ""}, + {KeyStr("I", 34U, kTypeValue), "val2"}, + {KeyStr("I", 33U, kTypeSingleDeletion), ""}, + {KeyStr("I", 32U, kTypeValue), "val3"}, + {KeyStr("I", 31U, kTypeSingleDeletion), ""}, + {KeyStr("J", 34U, kTypeValue), "val"}, + {KeyStr("J", 33U, kTypeSingleDeletion), ""}, + {KeyStr("J", 25U, kTypeValue), "val2"}, + {KeyStr("J", 24U, kTypeSingleDeletion), ""}, + }); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile({ + {KeyStr("A", 14U, kTypeSingleDeletion), ""}, + {KeyStr("A", 13U, kTypeValue), "val2"}, + {KeyStr("C", 14U, kTypeSingleDeletion), ""}, + {KeyStr("C", 13U, kTypeValue), "val"}, + {KeyStr("E", 12U, kTypeSingleDeletion), ""}, + {KeyStr("F", 4U, kTypeSingleDeletion), ""}, + {KeyStr("F", 3U, kTypeValue), "val"}, + {KeyStr("G", 14U, kTypeSingleDeletion), ""}, + {KeyStr("G", 13U, kTypeValue), "val3"}, + {KeyStr("H", 14U, kTypeSingleDeletion), ""}, + {KeyStr("H", 13U, kTypeValue), "val2"}, + {KeyStr("I", 13U, kTypeValue), "val4"}, + {KeyStr("I", 12U, kTypeSingleDeletion), ""}, + {KeyStr("I", 11U, kTypeValue), "val5"}, + {KeyStr("J", 15U, kTypeValue), "val3"}, + {KeyStr("J", 14U, kTypeSingleDeletion), ""}, + }); + AddMockFile(file2); + + auto expected_results = mock::MakeMockFile({ + {KeyStr("A", 24U, kTypeSingleDeletion), ""}, + {KeyStr("A", 23U, kTypeValue), ""}, + {KeyStr("B", 24U, kTypeSingleDeletion), ""}, + {KeyStr("B", 23U, kTypeValue), ""}, + {KeyStr("D", 24U, kTypeSingleDeletion), ""}, + {KeyStr("E", 12U, kTypeSingleDeletion), ""}, + {KeyStr("G", 32U, kTypeSingleDeletion), ""}, + {KeyStr("G", 31U, kTypeValue), ""}, + {KeyStr("H", 31U, kTypeValue), "val"}, + {KeyStr("I", 35U, kTypeSingleDeletion), ""}, + {KeyStr("I", 34U, kTypeValue), ""}, + {KeyStr("I", 31U, kTypeSingleDeletion), ""}, + {KeyStr("I", 13U, kTypeValue), "val4"}, + {KeyStr("J", 34U, kTypeValue), "val"}, + {KeyStr("J", 33U, kTypeSingleDeletion), ""}, + {KeyStr("J", 25U, kTypeValue), "val2"}, + {KeyStr("J", 24U, kTypeSingleDeletion), ""}, + {KeyStr("J", 15U, kTypeValue), "val3"}, 
+ {KeyStr("J", 14U, kTypeSingleDeletion), ""}, + }); + + SetLastSequence(24U); + auto files = cfd_->current()->storage_info()->LevelFiles(0); + RunCompaction({files}, expected_results, {10U, 20U, 30U}, 20U); +} + +TEST_F(CompactionJobTest, SingleDeleteZeroSeq) { + NewDB(); + + auto file1 = mock::MakeMockFile({ + {KeyStr("A", 10U, kTypeSingleDeletion), ""}, + {KeyStr("dummy", 5U, kTypeValue), "val2"}, + }); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile({ + {KeyStr("A", 0U, kTypeValue), "val"}, + }); + AddMockFile(file2); + + auto expected_results = mock::MakeMockFile({ + {KeyStr("dummy", 0U, kTypeValue), "val2"}, + }); + + SetLastSequence(22U); + auto files = cfd_->current()->storage_info()->LevelFiles(0); + RunCompaction({files}, expected_results, {}); +} + +TEST_F(CompactionJobTest, MultiSingleDelete) { + // Tests three scenarios involving multiple single delete/put pairs: + // + // A: Put Snapshot SDel Put SDel -> Put Snapshot SDel + // B: Snapshot Put SDel Put SDel Snapshot -> Snapshot SDel Snapshot + // C: SDel Put SDel Snapshot Put -> Snapshot Put + // D: (Put) SDel Snapshot Put SDel -> (Put) SDel Snapshot SDel + // E: Put SDel Snapshot Put SDel -> Snapshot SDel + // F: Put SDel Put Sdel Snapshot -> removed + // G: Snapshot SDel Put SDel Put -> Snapshot Put SDel + // H: (Put) Put SDel Put Sdel Snapshot -> Removed + // I: (Put) Snapshot Put SDel Put SDel -> SDel + // J: Put Put SDel Put SDel SDel Snapshot Put Put SDel SDel Put + // -> Snapshot Put + // K: SDel SDel Put SDel Put Put Snapshot SDel Put SDel SDel Put SDel + // -> Snapshot Put Snapshot SDel + // L: SDel Put Del Put SDel Snapshot Del Put Del SDel Put SDel + // -> Snapshot SDel + // M: (Put) SDel Put Del Put SDel Snapshot Put Del SDel Put SDel Del + // -> SDel Snapshot Del + NewDB(); + + auto file1 = mock::MakeMockFile({ + {KeyStr("A", 14U, kTypeSingleDeletion), ""}, + {KeyStr("A", 13U, kTypeValue), "val5"}, + {KeyStr("A", 12U, kTypeSingleDeletion), ""}, + {KeyStr("B", 14U, kTypeSingleDeletion), ""}, + {KeyStr("B", 13U, kTypeValue), "val2"}, + {KeyStr("C", 14U, kTypeValue), "val3"}, + {KeyStr("D", 12U, kTypeSingleDeletion), ""}, + {KeyStr("D", 11U, kTypeValue), "val4"}, + {KeyStr("G", 15U, kTypeValue), "val"}, + {KeyStr("G", 14U, kTypeSingleDeletion), ""}, + {KeyStr("G", 13U, kTypeValue), "val"}, + {KeyStr("I", 14U, kTypeSingleDeletion), ""}, + {KeyStr("I", 13U, kTypeValue), "val"}, + {KeyStr("J", 15U, kTypeValue), "val"}, + {KeyStr("J", 14U, kTypeSingleDeletion), ""}, + {KeyStr("J", 13U, kTypeSingleDeletion), ""}, + {KeyStr("J", 12U, kTypeValue), "val"}, + {KeyStr("J", 11U, kTypeValue), "val"}, + {KeyStr("K", 16U, kTypeSingleDeletion), ""}, + {KeyStr("K", 15U, kTypeValue), "val1"}, + {KeyStr("K", 14U, kTypeSingleDeletion), ""}, + {KeyStr("K", 13U, kTypeSingleDeletion), ""}, + {KeyStr("K", 12U, kTypeValue), "val2"}, + {KeyStr("K", 11U, kTypeSingleDeletion), ""}, + {KeyStr("L", 16U, kTypeSingleDeletion), ""}, + {KeyStr("L", 15U, kTypeValue), "val"}, + {KeyStr("L", 14U, kTypeSingleDeletion), ""}, + {KeyStr("L", 13U, kTypeDeletion), ""}, + {KeyStr("L", 12U, kTypeValue), "val"}, + {KeyStr("L", 11U, kTypeDeletion), ""}, + {KeyStr("M", 16U, kTypeDeletion), ""}, + {KeyStr("M", 15U, kTypeSingleDeletion), ""}, + {KeyStr("M", 14U, kTypeValue), "val"}, + {KeyStr("M", 13U, kTypeSingleDeletion), ""}, + {KeyStr("M", 12U, kTypeDeletion), ""}, + {KeyStr("M", 11U, kTypeValue), "val"}, + }); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile({ + {KeyStr("A", 10U, kTypeValue), "val"}, + {KeyStr("B", 12U, kTypeSingleDeletion), 
""}, + {KeyStr("B", 11U, kTypeValue), "val2"}, + {KeyStr("C", 10U, kTypeSingleDeletion), ""}, + {KeyStr("C", 9U, kTypeValue), "val6"}, + {KeyStr("C", 8U, kTypeSingleDeletion), ""}, + {KeyStr("D", 10U, kTypeSingleDeletion), ""}, + {KeyStr("E", 12U, kTypeSingleDeletion), ""}, + {KeyStr("E", 11U, kTypeValue), "val"}, + {KeyStr("E", 5U, kTypeSingleDeletion), ""}, + {KeyStr("E", 4U, kTypeValue), "val"}, + {KeyStr("F", 6U, kTypeSingleDeletion), ""}, + {KeyStr("F", 5U, kTypeValue), "val"}, + {KeyStr("F", 4U, kTypeSingleDeletion), ""}, + {KeyStr("F", 3U, kTypeValue), "val"}, + {KeyStr("G", 12U, kTypeSingleDeletion), ""}, + {KeyStr("H", 6U, kTypeSingleDeletion), ""}, + {KeyStr("H", 5U, kTypeValue), "val"}, + {KeyStr("H", 4U, kTypeSingleDeletion), ""}, + {KeyStr("H", 3U, kTypeValue), "val"}, + {KeyStr("I", 12U, kTypeSingleDeletion), ""}, + {KeyStr("I", 11U, kTypeValue), "val"}, + {KeyStr("J", 6U, kTypeSingleDeletion), ""}, + {KeyStr("J", 5U, kTypeSingleDeletion), ""}, + {KeyStr("J", 4U, kTypeValue), "val"}, + {KeyStr("J", 3U, kTypeSingleDeletion), ""}, + {KeyStr("J", 2U, kTypeValue), "val"}, + {KeyStr("K", 8U, kTypeValue), "val3"}, + {KeyStr("K", 7U, kTypeValue), "val4"}, + {KeyStr("K", 6U, kTypeSingleDeletion), ""}, + {KeyStr("K", 5U, kTypeValue), "val5"}, + {KeyStr("K", 2U, kTypeSingleDeletion), ""}, + {KeyStr("K", 1U, kTypeSingleDeletion), ""}, + {KeyStr("L", 5U, kTypeSingleDeletion), ""}, + {KeyStr("L", 4U, kTypeValue), "val"}, + {KeyStr("L", 3U, kTypeDeletion), ""}, + {KeyStr("L", 2U, kTypeValue), "val"}, + {KeyStr("L", 1U, kTypeSingleDeletion), ""}, + {KeyStr("M", 10U, kTypeSingleDeletion), ""}, + {KeyStr("M", 7U, kTypeValue), "val"}, + {KeyStr("M", 5U, kTypeDeletion), ""}, + {KeyStr("M", 4U, kTypeValue), "val"}, + {KeyStr("M", 3U, kTypeSingleDeletion), ""}, + }); + AddMockFile(file2); + + auto file3 = mock::MakeMockFile({ + {KeyStr("D", 1U, kTypeValue), "val"}, + {KeyStr("H", 1U, kTypeValue), "val"}, + {KeyStr("I", 2U, kTypeValue), "val"}, + }); + AddMockFile(file3, 2); + + auto file4 = mock::MakeMockFile({ + {KeyStr("M", 1U, kTypeValue), "val"}, + }); + AddMockFile(file4, 2); + + auto expected_results = + mock::MakeMockFile({{KeyStr("A", 14U, kTypeSingleDeletion), ""}, + {KeyStr("A", 13U, kTypeValue), ""}, + {KeyStr("A", 12U, kTypeSingleDeletion), ""}, + {KeyStr("A", 10U, kTypeValue), "val"}, + {KeyStr("B", 14U, kTypeSingleDeletion), ""}, + {KeyStr("B", 13U, kTypeValue), ""}, + {KeyStr("C", 14U, kTypeValue), "val3"}, + {KeyStr("D", 12U, kTypeSingleDeletion), ""}, + {KeyStr("D", 11U, kTypeValue), ""}, + {KeyStr("D", 10U, kTypeSingleDeletion), ""}, + {KeyStr("E", 12U, kTypeSingleDeletion), ""}, + {KeyStr("E", 11U, kTypeValue), ""}, + {KeyStr("G", 15U, kTypeValue), "val"}, + {KeyStr("G", 12U, kTypeSingleDeletion), ""}, + {KeyStr("I", 14U, kTypeSingleDeletion), ""}, + {KeyStr("I", 13U, kTypeValue), ""}, + {KeyStr("J", 15U, kTypeValue), "val"}, + {KeyStr("K", 16U, kTypeSingleDeletion), ""}, + {KeyStr("K", 15U, kTypeValue), ""}, + {KeyStr("K", 11U, kTypeSingleDeletion), ""}, + {KeyStr("K", 8U, kTypeValue), "val3"}, + {KeyStr("L", 16U, kTypeSingleDeletion), ""}, + {KeyStr("L", 15U, kTypeValue), ""}, + {KeyStr("M", 16U, kTypeDeletion), ""}, + {KeyStr("M", 3U, kTypeSingleDeletion), ""}}); + + SetLastSequence(22U); + auto files = cfd_->current()->storage_info()->LevelFiles(0); + RunCompaction({files}, expected_results, {10U}, 10U); +} + +// This test documents the behavior where a corrupt key follows a deletion or a +// single deletion and the (single) deletion gets removed while the corrupt key +// 
gets written out. TODO(noetzli): We probably want a better way to treat +// corrupt keys. +TEST_F(CompactionJobTest, CorruptionAfterDeletion) { + NewDB(); + + auto file1 = + mock::MakeMockFile({{test::KeyStr("A", 6U, kTypeValue), "val3"}, + {test::KeyStr("a", 5U, kTypeDeletion), ""}, + {test::KeyStr("a", 4U, kTypeValue, true), "val"}}); + AddMockFile(file1); + + auto file2 = + mock::MakeMockFile({{test::KeyStr("b", 3U, kTypeSingleDeletion), ""}, + {test::KeyStr("b", 2U, kTypeValue, true), "val"}, + {test::KeyStr("c", 1U, kTypeValue), "val2"}}); + AddMockFile(file2); + + auto expected_results = + mock::MakeMockFile({{test::KeyStr("A", 0U, kTypeValue), "val3"}, + {test::KeyStr("a", 0U, kTypeValue, true), "val"}, + {test::KeyStr("b", 0U, kTypeValue, true), "val"}, + {test::KeyStr("c", 0U, kTypeValue), "val2"}}); + + SetLastSequence(6U); + auto files = cfd_->current()->storage_info()->LevelFiles(0); + RunCompaction({files}, expected_results); +} + +TEST_F(CompactionJobTest, OldestBlobFileNumber) { + NewDB(); + + // Note: blob1 is inlined TTL, so it will not be considered for the purposes + // of identifying the oldest referenced blob file. Similarly, blob6 will be + // ignored because it has TTL and hence refers to a TTL blob file. + const stl_wrappers::KVMap::value_type blob1( + KeyStr("a", 1U, kTypeBlobIndex), BlobStrInlinedTTL("foo", 1234567890ULL)); + const stl_wrappers::KVMap::value_type blob2(KeyStr("b", 2U, kTypeBlobIndex), + BlobStr(59, 123456, 999)); + const stl_wrappers::KVMap::value_type blob3(KeyStr("c", 3U, kTypeBlobIndex), + BlobStr(138, 1000, 1 << 8)); + auto file1 = mock::MakeMockFile({blob1, blob2, blob3}); + AddMockFile(file1); + + const stl_wrappers::KVMap::value_type blob4(KeyStr("d", 4U, kTypeBlobIndex), + BlobStr(199, 3 << 10, 1 << 20)); + const stl_wrappers::KVMap::value_type blob5(KeyStr("e", 5U, kTypeBlobIndex), + BlobStr(19, 6789, 333)); + const stl_wrappers::KVMap::value_type blob6( + KeyStr("f", 6U, kTypeBlobIndex), + BlobStrTTL(5, 2048, 1 << 7, 1234567890ULL)); + auto file2 = mock::MakeMockFile({blob4, blob5, blob6}); + AddMockFile(file2); + + const stl_wrappers::KVMap::value_type expected_blob1( + KeyStr("a", 0U, kTypeBlobIndex), blob1.second); + const stl_wrappers::KVMap::value_type expected_blob2( + KeyStr("b", 0U, kTypeBlobIndex), blob2.second); + const stl_wrappers::KVMap::value_type expected_blob3( + KeyStr("c", 0U, kTypeBlobIndex), blob3.second); + const stl_wrappers::KVMap::value_type expected_blob4( + KeyStr("d", 0U, kTypeBlobIndex), blob4.second); + const stl_wrappers::KVMap::value_type expected_blob5( + KeyStr("e", 0U, kTypeBlobIndex), blob5.second); + const stl_wrappers::KVMap::value_type expected_blob6( + KeyStr("f", 0U, kTypeBlobIndex), blob6.second); + auto expected_results = + mock::MakeMockFile({expected_blob1, expected_blob2, expected_blob3, + expected_blob4, expected_blob5, expected_blob6}); + + SetLastSequence(6U); + auto files = cfd_->current()->storage_info()->LevelFiles(0); + RunCompaction({files}, expected_results, std::vector<SequenceNumber>(), + kMaxSequenceNumber, /* output_level */ 1, /* verify */ true, + /* expected_oldest_blob_file_number */ 19); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + +#else +#include <stdio.h> + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, + "SKIPPED as CompactionJobStats is not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // ROCKSDB_LITE diff --git 
a/src/rocksdb/db/compaction/compaction_picker.cc b/src/rocksdb/db/compaction/compaction_picker.cc new file mode 100644 index 000000000..4355d4b91 --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_picker.cc @@ -0,0 +1,1131 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/compaction/compaction_picker.h" + +#include <cinttypes> +#include <limits> +#include <queue> +#include <string> +#include <utility> +#include <vector> +#include "db/column_family.h" +#include "file/filename.h" +#include "logging/log_buffer.h" +#include "monitoring/statistics.h" +#include "test_util/sync_point.h" +#include "util/random.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { +uint64_t TotalCompensatedFileSize(const std::vector<FileMetaData*>& files) { + uint64_t sum = 0; + for (size_t i = 0; i < files.size() && files[i]; i++) { + sum += files[i]->compensated_file_size; + } + return sum; +} +} // anonymous namespace + +bool FindIntraL0Compaction(const std::vector<FileMetaData*>& level_files, + size_t min_files_to_compact, + uint64_t max_compact_bytes_per_del_file, + uint64_t max_compaction_bytes, + CompactionInputFiles* comp_inputs, + SequenceNumber earliest_mem_seqno) { + // Do not pick an ingested file when there is at least one unflushed memtable + // whose seqno range overlaps with the sst. + TEST_SYNC_POINT("FindIntraL0Compaction"); + size_t start = 0; + for (; start < level_files.size(); start++) { + if (level_files[start]->being_compacted) { + return false; + } + // If there is no data in the memtable, the earliest sequence number would be + // the largest sequence number in the last memtable. + // Because all files are sorted in descending order by largest_seqno, we + // only need to check the first one. + if (level_files[start]->fd.largest_seqno <= earliest_mem_seqno) { + break; + } + } + if (start >= level_files.size()) { + return false; + } + size_t compact_bytes = static_cast<size_t>(level_files[start]->fd.file_size); + uint64_t compensated_compact_bytes = + level_files[start]->compensated_file_size; + size_t compact_bytes_per_del_file = port::kMaxSizet; + // Compaction range will be [start, limit). + size_t limit; + // Pull in files until the amount of compaction work per deleted file begins + // increasing or maximum total compaction size is reached.
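+  // For example (hypothetical sizes): if the eligible files are 4 MB, 4 MB,
+  // 4 MB and 100 MB, the running cost per deleted file is 8 MB, then 6 MB,
+  // then ~37 MB, so the scan stops before the 100 MB file and proposes
+  // [start, start + 3), provided that range holds at least
+  // min_files_to_compact files and stays below max_compact_bytes_per_del_file.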
+ size_t new_compact_bytes_per_del_file = 0; + for (limit = start + 1; limit < level_files.size(); ++limit) { + compact_bytes += static_cast<size_t>(level_files[limit]->fd.file_size); + compensated_compact_bytes += level_files[limit]->compensated_file_size; + new_compact_bytes_per_del_file = compact_bytes / (limit - start); + if (level_files[limit]->being_compacted || + new_compact_bytes_per_del_file > compact_bytes_per_del_file || + compensated_compact_bytes > max_compaction_bytes) { + break; + } + compact_bytes_per_del_file = new_compact_bytes_per_del_file; + } + + if ((limit - start) >= min_files_to_compact && + compact_bytes_per_del_file < max_compact_bytes_per_del_file) { + assert(comp_inputs != nullptr); + comp_inputs->level = 0; + for (size_t i = start; i < limit; ++i) { + comp_inputs->files.push_back(level_files[i]); + } + return true; + } + return false; +} + +// Determine compression type, based on user options, level of the output +// file and whether compression is disabled. +// If enable_compression is false, then compression is always disabled no +// matter what the values of the other two parameters are. +// Otherwise, the compression type is determined based on options and level. +CompressionType GetCompressionType(const ImmutableCFOptions& ioptions, + const VersionStorageInfo* vstorage, + const MutableCFOptions& mutable_cf_options, + int level, int base_level, + const bool enable_compression) { + if (!enable_compression) { + // disable compression + return kNoCompression; + } + + // If bottommost_compression is set and we are compacting to the + // bottommost level then we should use it. + if (ioptions.bottommost_compression != kDisableCompressionOption && + level >= (vstorage->num_non_empty_levels() - 1)) { + return ioptions.bottommost_compression; + } + // If the user has specified a different compression level for each level, + // then pick the compression for that level. + if (!ioptions.compression_per_level.empty()) { + assert(level == 0 || level >= base_level); + int idx = (level == 0) ? 0 : level - base_level + 1; + + const int n = static_cast<int>(ioptions.compression_per_level.size()) - 1; + // It is possible for level_ to be -1; in that case, we use level + // 0's compression. This occurs mostly in backwards compatibility + // situations when the builder doesn't know what level the file + // belongs to. Likewise, if level is beyond the end of the + // specified compression levels, use the last value. + return ioptions.compression_per_level[std::max(0, std::min(idx, n))]; + } else { + return mutable_cf_options.compression; + } +} + +CompressionOptions GetCompressionOptions(const ImmutableCFOptions& ioptions, + const VersionStorageInfo* vstorage, + int level, + const bool enable_compression) { + if (!enable_compression) { + return ioptions.compression_opts; + } + // If bottommost_compression is set and we are compacting to the + // bottommost level then we should use the specified compression options + // for the bottmomost_compression. + if (ioptions.bottommost_compression != kDisableCompressionOption && + level >= (vstorage->num_non_empty_levels() - 1) && + ioptions.bottommost_compression_opts.enabled) { + return ioptions.bottommost_compression_opts; + } + return ioptions.compression_opts; +} + +CompactionPicker::CompactionPicker(const ImmutableCFOptions& ioptions, + const InternalKeyComparator* icmp) + : ioptions_(ioptions), icmp_(icmp) {} + +CompactionPicker::~CompactionPicker() {} + +// Delete this compaction from the list of running compactions. 
+void CompactionPicker::ReleaseCompactionFiles(Compaction* c, Status status) { + UnregisterCompaction(c); + if (!status.ok()) { + c->ResetNextCompactionIndex(); + } +} + +void CompactionPicker::GetRange(const CompactionInputFiles& inputs, + InternalKey* smallest, + InternalKey* largest) const { + const int level = inputs.level; + assert(!inputs.empty()); + smallest->Clear(); + largest->Clear(); + + if (level == 0) { + for (size_t i = 0; i < inputs.size(); i++) { + FileMetaData* f = inputs[i]; + if (i == 0) { + *smallest = f->smallest; + *largest = f->largest; + } else { + if (icmp_->Compare(f->smallest, *smallest) < 0) { + *smallest = f->smallest; + } + if (icmp_->Compare(f->largest, *largest) > 0) { + *largest = f->largest; + } + } + } + } else { + *smallest = inputs[0]->smallest; + *largest = inputs[inputs.size() - 1]->largest; + } +} + +void CompactionPicker::GetRange(const CompactionInputFiles& inputs1, + const CompactionInputFiles& inputs2, + InternalKey* smallest, + InternalKey* largest) const { + assert(!inputs1.empty() || !inputs2.empty()); + if (inputs1.empty()) { + GetRange(inputs2, smallest, largest); + } else if (inputs2.empty()) { + GetRange(inputs1, smallest, largest); + } else { + InternalKey smallest1, smallest2, largest1, largest2; + GetRange(inputs1, &smallest1, &largest1); + GetRange(inputs2, &smallest2, &largest2); + *smallest = + icmp_->Compare(smallest1, smallest2) < 0 ? smallest1 : smallest2; + *largest = icmp_->Compare(largest1, largest2) < 0 ? largest2 : largest1; + } +} + +void CompactionPicker::GetRange(const std::vector<CompactionInputFiles>& inputs, + InternalKey* smallest, + InternalKey* largest) const { + InternalKey current_smallest; + InternalKey current_largest; + bool initialized = false; + for (const auto& in : inputs) { + if (in.empty()) { + continue; + } + GetRange(in, &current_smallest, &current_largest); + if (!initialized) { + *smallest = current_smallest; + *largest = current_largest; + initialized = true; + } else { + if (icmp_->Compare(current_smallest, *smallest) < 0) { + *smallest = current_smallest; + } + if (icmp_->Compare(current_largest, *largest) > 0) { + *largest = current_largest; + } + } + } + assert(initialized); +} + +bool CompactionPicker::ExpandInputsToCleanCut(const std::string& /*cf_name*/, + VersionStorageInfo* vstorage, + CompactionInputFiles* inputs, + InternalKey** next_smallest) { + // This isn't good compaction + assert(!inputs->empty()); + + const int level = inputs->level; + // GetOverlappingInputs will always do the right thing for level-0. + // So we don't need to do any expansion if level == 0. + if (level == 0) { + return true; + } + + InternalKey smallest, largest; + + // Keep expanding inputs until we are sure that there is a "clean cut" + // boundary between the files in input and the surrounding files. + // This will ensure that no parts of a key are lost during compaction. + int hint_index = -1; + size_t old_size; + do { + old_size = inputs->size(); + GetRange(*inputs, &smallest, &largest); + inputs->clear(); + vstorage->GetOverlappingInputs(level, &smallest, &largest, &inputs->files, + hint_index, &hint_index, true, + next_smallest); + } while (inputs->size() > old_size); + + // we started off with inputs non-empty and the previous loop only grew + // inputs. thus, inputs should be non-empty here + assert(!inputs->empty()); + + // If, after the expansion, there are files that are already under + // compaction, then we must drop/cancel this compaction.
+ if (AreFilesInCompaction(inputs->files)) { + return false; + } + return true; +} + +bool CompactionPicker::RangeOverlapWithCompaction( + const Slice& smallest_user_key, const Slice& largest_user_key, + int level) const { + const Comparator* ucmp = icmp_->user_comparator(); + for (Compaction* c : compactions_in_progress_) { + if (c->output_level() == level && + ucmp->Compare(smallest_user_key, c->GetLargestUserKey()) <= 0 && + ucmp->Compare(largest_user_key, c->GetSmallestUserKey()) >= 0) { + // Overlap + return true; + } + } + // Did not overlap with any running compaction in level `level` + return false; +} + +bool CompactionPicker::FilesRangeOverlapWithCompaction( + const std::vector<CompactionInputFiles>& inputs, int level) const { + bool is_empty = true; + for (auto& in : inputs) { + if (!in.empty()) { + is_empty = false; + break; + } + } + if (is_empty) { + // No files in inputs + return false; + } + + InternalKey smallest, largest; + GetRange(inputs, &smallest, &largest); + return RangeOverlapWithCompaction(smallest.user_key(), largest.user_key(), + level); +} + +// Returns true if any one of specified files are being compacted +bool CompactionPicker::AreFilesInCompaction( + const std::vector<FileMetaData*>& files) { + for (size_t i = 0; i < files.size(); i++) { + if (files[i]->being_compacted) { + return true; + } + } + return false; +} + +Compaction* CompactionPicker::CompactFiles( + const CompactionOptions& compact_options, + const std::vector<CompactionInputFiles>& input_files, int output_level, + VersionStorageInfo* vstorage, const MutableCFOptions& mutable_cf_options, + uint32_t output_path_id) { + assert(input_files.size()); + // This compaction output should not overlap with a running compaction as + // `SanitizeCompactionInputFiles` should've checked earlier and db mutex + // shouldn't have been released since. + assert(!FilesRangeOverlapWithCompaction(input_files, output_level)); + + CompressionType compression_type; + if (compact_options.compression == kDisableCompressionOption) { + int base_level; + if (ioptions_.compaction_style == kCompactionStyleLevel) { + base_level = vstorage->base_level(); + } else { + base_level = 1; + } + compression_type = + GetCompressionType(ioptions_, vstorage, mutable_cf_options, + output_level, base_level); + } else { + // TODO(ajkr): `CompactionOptions` offers configurable `CompressionType` + // without configurable `CompressionOptions`, which is inconsistent. 
+ compression_type = compact_options.compression; + } + auto c = new Compaction( + vstorage, ioptions_, mutable_cf_options, input_files, output_level, + compact_options.output_file_size_limit, + mutable_cf_options.max_compaction_bytes, output_path_id, compression_type, + GetCompressionOptions(ioptions_, vstorage, output_level), + compact_options.max_subcompactions, + /* grandparents */ {}, true); + RegisterCompaction(c); + return c; +} + +Status CompactionPicker::GetCompactionInputsFromFileNumbers( + std::vector<CompactionInputFiles>* input_files, + std::unordered_set<uint64_t>* input_set, const VersionStorageInfo* vstorage, + const CompactionOptions& /*compact_options*/) const { + if (input_set->size() == 0U) { + return Status::InvalidArgument( + "Compaction must include at least one file."); + } + assert(input_files); + + std::vector<CompactionInputFiles> matched_input_files; + matched_input_files.resize(vstorage->num_levels()); + int first_non_empty_level = -1; + int last_non_empty_level = -1; + // TODO(yhchiang): use a lazy-initialized mapping from + // file_number to FileMetaData in Version. + for (int level = 0; level < vstorage->num_levels(); ++level) { + for (auto file : vstorage->LevelFiles(level)) { + auto iter = input_set->find(file->fd.GetNumber()); + if (iter != input_set->end()) { + matched_input_files[level].files.push_back(file); + input_set->erase(iter); + last_non_empty_level = level; + if (first_non_empty_level == -1) { + first_non_empty_level = level; + } + } + } + } + + if (!input_set->empty()) { + std::string message( + "Cannot find matched SST files for the following file numbers:"); + for (auto fn : *input_set) { + message += " "; + message += ToString(fn); + } + return Status::InvalidArgument(message); + } + + for (int level = first_non_empty_level; level <= last_non_empty_level; + ++level) { + matched_input_files[level].level = level; + input_files->emplace_back(std::move(matched_input_files[level])); + } + + return Status::OK(); +} + +// Returns true if any one of the parent files are being compacted +bool CompactionPicker::IsRangeInCompaction(VersionStorageInfo* vstorage, + const InternalKey* smallest, + const InternalKey* largest, + int level, int* level_index) { + std::vector<FileMetaData*> inputs; + assert(level < NumberLevels()); + + vstorage->GetOverlappingInputs(level, smallest, largest, &inputs, + level_index ? *level_index : 0, level_index); + return AreFilesInCompaction(inputs); +} + +// Populates the set of inputs of all other levels that overlap with the +// start level. +// Now we assume all levels except start level and output level are empty. +// Will also attempt to expand "start level" if that doesn't expand +// "output level" or cause "level" to include a file for compaction that has an +// overlapping user-key with another file. 
+// REQUIRES: input_level and output_level are different +// REQUIRES: inputs->empty() == false +// Returns false if files on parent level are currently in compaction, which +// means that we can't compact them +bool CompactionPicker::SetupOtherInputs( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, CompactionInputFiles* inputs, + CompactionInputFiles* output_level_inputs, int* parent_index, + int base_index) { + assert(!inputs->empty()); + assert(output_level_inputs->empty()); + const int input_level = inputs->level; + const int output_level = output_level_inputs->level; + if (input_level == output_level) { + // no possibility of conflict + return true; + } + + // For now, we only support merging two levels, start level and output level. + // We need to assert other levels are empty. + for (int l = input_level + 1; l < output_level; l++) { + assert(vstorage->NumLevelFiles(l) == 0); + } + + InternalKey smallest, largest; + + // Get the range one last time. + GetRange(*inputs, &smallest, &largest); + + // Populate the set of next-level files (inputs_GetOutputLevelInputs()) to + // include in compaction + vstorage->GetOverlappingInputs(output_level, &smallest, &largest, + &output_level_inputs->files, *parent_index, + parent_index); + if (AreFilesInCompaction(output_level_inputs->files)) { + return false; + } + if (!output_level_inputs->empty()) { + if (!ExpandInputsToCleanCut(cf_name, vstorage, output_level_inputs)) { + return false; + } + } + + // See if we can further grow the number of inputs in "level" without + // changing the number of "level+1" files we pick up. We also choose NOT + // to expand if this would cause "level" to include some entries for some + // user key, while excluding other entries for the same user key. This + // can happen when one user key spans multiple files. 
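+  // For example (hypothetical files): if the start level holds F1[a..c] and
+  // F2[d..j], the output level holds a single file G1[a..z], and only F1 was
+  // picked initially, then pulling F2 in does not change the output-level file
+  // set ({G1}), so the inputs are expanded as long as the total size stays
+  // under max_compaction_bytes and none of the files are being compacted.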
+ if (!output_level_inputs->empty()) { + const uint64_t limit = mutable_cf_options.max_compaction_bytes; + const uint64_t output_level_inputs_size = + TotalCompensatedFileSize(output_level_inputs->files); + const uint64_t inputs_size = TotalCompensatedFileSize(inputs->files); + bool expand_inputs = false; + + CompactionInputFiles expanded_inputs; + expanded_inputs.level = input_level; + // Get closed interval of output level + InternalKey all_start, all_limit; + GetRange(*inputs, *output_level_inputs, &all_start, &all_limit); + bool try_overlapping_inputs = true; + vstorage->GetOverlappingInputs(input_level, &all_start, &all_limit, + &expanded_inputs.files, base_index, nullptr); + uint64_t expanded_inputs_size = + TotalCompensatedFileSize(expanded_inputs.files); + if (!ExpandInputsToCleanCut(cf_name, vstorage, &expanded_inputs)) { + try_overlapping_inputs = false; + } + if (try_overlapping_inputs && expanded_inputs.size() > inputs->size() && + output_level_inputs_size + expanded_inputs_size < limit && + !AreFilesInCompaction(expanded_inputs.files)) { + InternalKey new_start, new_limit; + GetRange(expanded_inputs, &new_start, &new_limit); + CompactionInputFiles expanded_output_level_inputs; + expanded_output_level_inputs.level = output_level; + vstorage->GetOverlappingInputs(output_level, &new_start, &new_limit, + &expanded_output_level_inputs.files, + *parent_index, parent_index); + assert(!expanded_output_level_inputs.empty()); + if (!AreFilesInCompaction(expanded_output_level_inputs.files) && + ExpandInputsToCleanCut(cf_name, vstorage, + &expanded_output_level_inputs) && + expanded_output_level_inputs.size() == output_level_inputs->size()) { + expand_inputs = true; + } + } + if (!expand_inputs) { + vstorage->GetCleanInputsWithinInterval(input_level, &all_start, + &all_limit, &expanded_inputs.files, + base_index, nullptr); + expanded_inputs_size = TotalCompensatedFileSize(expanded_inputs.files); + if (expanded_inputs.size() > inputs->size() && + output_level_inputs_size + expanded_inputs_size < limit && + !AreFilesInCompaction(expanded_inputs.files)) { + expand_inputs = true; + } + } + if (expand_inputs) { + ROCKS_LOG_INFO(ioptions_.info_log, + "[%s] Expanding@%d %" ROCKSDB_PRIszt "+%" ROCKSDB_PRIszt + "(%" PRIu64 "+%" PRIu64 " bytes) to %" ROCKSDB_PRIszt + "+%" ROCKSDB_PRIszt " (%" PRIu64 "+%" PRIu64 " bytes)\n", + cf_name.c_str(), input_level, inputs->size(), + output_level_inputs->size(), inputs_size, + output_level_inputs_size, expanded_inputs.size(), + output_level_inputs->size(), expanded_inputs_size, + output_level_inputs_size); + inputs->files = expanded_inputs.files; + } + } + return true; +} + +void CompactionPicker::GetGrandparents( + VersionStorageInfo* vstorage, const CompactionInputFiles& inputs, + const CompactionInputFiles& output_level_inputs, + std::vector<FileMetaData*>* grandparents) { + InternalKey start, limit; + GetRange(inputs, output_level_inputs, &start, &limit); + // Compute the set of grandparent files that overlap this compaction + // (parent == level+1; grandparent == level+2) + if (output_level_inputs.level + 1 < NumberLevels()) { + vstorage->GetOverlappingInputs(output_level_inputs.level + 1, &start, + &limit, grandparents); + } +} + +Compaction* CompactionPicker::CompactRange( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, int input_level, int output_level, + const CompactRangeOptions& compact_range_options, const InternalKey* begin, + const InternalKey* end, InternalKey** compaction_end, bool* 
manual_conflict, + uint64_t max_file_num_to_ignore) { + // CompactionPickerFIFO has its own implementation of compact range + assert(ioptions_.compaction_style != kCompactionStyleFIFO); + + if (input_level == ColumnFamilyData::kCompactAllLevels) { + assert(ioptions_.compaction_style == kCompactionStyleUniversal); + + // Universal compaction with more than one level always compacts all the + // files together to the last level. + assert(vstorage->num_levels() > 1); + // DBImpl::CompactRange() set output level to be the last level + if (ioptions_.allow_ingest_behind) { + assert(output_level == vstorage->num_levels() - 2); + } else { + assert(output_level == vstorage->num_levels() - 1); + } + // DBImpl::RunManualCompaction will make full range for universal compaction + assert(begin == nullptr); + assert(end == nullptr); + *compaction_end = nullptr; + + int start_level = 0; + for (; start_level < vstorage->num_levels() && + vstorage->NumLevelFiles(start_level) == 0; + start_level++) { + } + if (start_level == vstorage->num_levels()) { + return nullptr; + } + + if ((start_level == 0) && (!level0_compactions_in_progress_.empty())) { + *manual_conflict = true; + // Only one level 0 compaction allowed + return nullptr; + } + + std::vector<CompactionInputFiles> inputs(vstorage->num_levels() - + start_level); + for (int level = start_level; level < vstorage->num_levels(); level++) { + inputs[level - start_level].level = level; + auto& files = inputs[level - start_level].files; + for (FileMetaData* f : vstorage->LevelFiles(level)) { + files.push_back(f); + } + if (AreFilesInCompaction(files)) { + *manual_conflict = true; + return nullptr; + } + } + + // 2 non-exclusive manual compactions could run at the same time producing + // overlaping outputs in the same level. + if (FilesRangeOverlapWithCompaction(inputs, output_level)) { + // This compaction output could potentially conflict with the output + // of a currently running compaction, we cannot run it. + *manual_conflict = true; + return nullptr; + } + + Compaction* c = new Compaction( + vstorage, ioptions_, mutable_cf_options, std::move(inputs), + output_level, + MaxFileSizeForLevel(mutable_cf_options, output_level, + ioptions_.compaction_style), + /* max_compaction_bytes */ LLONG_MAX, + compact_range_options.target_path_id, + GetCompressionType(ioptions_, vstorage, mutable_cf_options, + output_level, 1), + GetCompressionOptions(ioptions_, vstorage, output_level), + compact_range_options.max_subcompactions, /* grandparents */ {}, + /* is manual */ true); + RegisterCompaction(c); + return c; + } + + CompactionInputFiles inputs; + inputs.level = input_level; + bool covering_the_whole_range = true; + + // All files are 'overlapping' in universal style compaction. + // We have to compact the entire range in one shot. + if (ioptions_.compaction_style == kCompactionStyleUniversal) { + begin = nullptr; + end = nullptr; + } + + vstorage->GetOverlappingInputs(input_level, begin, end, &inputs.files); + if (inputs.empty()) { + return nullptr; + } + + if ((input_level == 0) && (!level0_compactions_in_progress_.empty())) { + // Only one level 0 compaction allowed + TEST_SYNC_POINT("CompactionPicker::CompactRange:Conflict"); + *manual_conflict = true; + return nullptr; + } + + // Avoid compacting too much in one shot in case the range is large. + // But we cannot do this for level-0 since level-0 files can overlap + // and we must not pick one file and drop another older file if the + // two files overlap. 
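+  // For example (hypothetical sizes): with max_compaction_bytes = 1 GB and
+  // four 400 MB input files, the running total hits the limit at the third
+  // file, so only the first three files are kept for this round;
+  // covering_the_whole_range becomes false and *compaction_end is later set
+  // so the manual compaction can resume from the remaining range.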
+ if (input_level > 0) { + const uint64_t limit = mutable_cf_options.max_compaction_bytes; + uint64_t total = 0; + for (size_t i = 0; i + 1 < inputs.size(); ++i) { + uint64_t s = inputs[i]->compensated_file_size; + total += s; + if (total >= limit) { + covering_the_whole_range = false; + inputs.files.resize(i + 1); + break; + } + } + } + assert(compact_range_options.target_path_id < + static_cast<uint32_t>(ioptions_.cf_paths.size())); + + // for BOTTOM LEVEL compaction only, use max_file_num_to_ignore to filter out + // files that are created during the current compaction. + if (compact_range_options.bottommost_level_compaction == + BottommostLevelCompaction::kForceOptimized && + max_file_num_to_ignore != port::kMaxUint64) { + assert(input_level == output_level); + // inputs_shrunk holds a continuous subset of input files which were all + // created before the current manual compaction + std::vector<FileMetaData*> inputs_shrunk; + size_t skip_input_index = inputs.size(); + for (size_t i = 0; i < inputs.size(); ++i) { + if (inputs[i]->fd.GetNumber() < max_file_num_to_ignore) { + inputs_shrunk.push_back(inputs[i]); + } else if (!inputs_shrunk.empty()) { + // inputs[i] was created during the current manual compaction and + // need to be skipped + skip_input_index = i; + break; + } + } + if (inputs_shrunk.empty()) { + return nullptr; + } + if (inputs.size() != inputs_shrunk.size()) { + inputs.files.swap(inputs_shrunk); + } + // set covering_the_whole_range to false if there is any file that need to + // be compacted in the range of inputs[skip_input_index+1, inputs.size()) + for (size_t i = skip_input_index + 1; i < inputs.size(); ++i) { + if (inputs[i]->fd.GetNumber() < max_file_num_to_ignore) { + covering_the_whole_range = false; + } + } + } + + InternalKey key_storage; + InternalKey* next_smallest = &key_storage; + if (ExpandInputsToCleanCut(cf_name, vstorage, &inputs, &next_smallest) == + false) { + // manual compaction is now multi-threaded, so it can + // happen that ExpandWhileOverlapping fails + // we handle it higher in RunManualCompaction + *manual_conflict = true; + return nullptr; + } + + if (covering_the_whole_range || !next_smallest) { + *compaction_end = nullptr; + } else { + **compaction_end = *next_smallest; + } + + CompactionInputFiles output_level_inputs; + if (output_level == ColumnFamilyData::kCompactToBaseLevel) { + assert(input_level == 0); + output_level = vstorage->base_level(); + assert(output_level > 0); + } + output_level_inputs.level = output_level; + if (input_level != output_level) { + int parent_index = -1; + if (!SetupOtherInputs(cf_name, mutable_cf_options, vstorage, &inputs, + &output_level_inputs, &parent_index, -1)) { + // manual compaction is now multi-threaded, so it can + // happen that SetupOtherInputs fails + // we handle it higher in RunManualCompaction + *manual_conflict = true; + return nullptr; + } + } + + std::vector<CompactionInputFiles> compaction_inputs({inputs}); + if (!output_level_inputs.empty()) { + compaction_inputs.push_back(output_level_inputs); + } + for (size_t i = 0; i < compaction_inputs.size(); i++) { + if (AreFilesInCompaction(compaction_inputs[i].files)) { + *manual_conflict = true; + return nullptr; + } + } + + // 2 non-exclusive manual compactions could run at the same time producing + // overlaping outputs in the same level. + if (FilesRangeOverlapWithCompaction(compaction_inputs, output_level)) { + // This compaction output could potentially conflict with the output + // of a currently running compaction, we cannot run it. 
+ *manual_conflict = true; + return nullptr; + } + + std::vector<FileMetaData*> grandparents; + GetGrandparents(vstorage, inputs, output_level_inputs, &grandparents); + Compaction* compaction = new Compaction( + vstorage, ioptions_, mutable_cf_options, std::move(compaction_inputs), + output_level, + MaxFileSizeForLevel(mutable_cf_options, output_level, + ioptions_.compaction_style, vstorage->base_level(), + ioptions_.level_compaction_dynamic_level_bytes), + mutable_cf_options.max_compaction_bytes, + compact_range_options.target_path_id, + GetCompressionType(ioptions_, vstorage, mutable_cf_options, output_level, + vstorage->base_level()), + GetCompressionOptions(ioptions_, vstorage, output_level), + compact_range_options.max_subcompactions, std::move(grandparents), + /* is manual compaction */ true); + + TEST_SYNC_POINT_CALLBACK("CompactionPicker::CompactRange:Return", compaction); + RegisterCompaction(compaction); + + // Creating a compaction influences the compaction score because the score + // takes running compactions into account (by skipping files that are already + // being compacted). Since we just changed compaction score, we recalculate it + // here + vstorage->ComputeCompactionScore(ioptions_, mutable_cf_options); + + return compaction; +} + +#ifndef ROCKSDB_LITE +namespace { +// Test whether two files have overlapping key-ranges. +bool HaveOverlappingKeyRanges(const Comparator* c, const SstFileMetaData& a, + const SstFileMetaData& b) { + if (c->Compare(a.smallestkey, b.smallestkey) >= 0) { + if (c->Compare(a.smallestkey, b.largestkey) <= 0) { + // b.smallestkey <= a.smallestkey <= b.largestkey + return true; + } + } else if (c->Compare(a.largestkey, b.smallestkey) >= 0) { + // a.smallestkey < b.smallestkey <= a.largestkey + return true; + } + if (c->Compare(a.largestkey, b.largestkey) <= 0) { + if (c->Compare(a.largestkey, b.smallestkey) >= 0) { + // b.smallestkey <= a.largestkey <= b.largestkey + return true; + } + } else if (c->Compare(a.smallestkey, b.largestkey) <= 0) { + // a.smallestkey <= b.largestkey < a.largestkey + return true; + } + return false; +} +} // namespace + +Status CompactionPicker::SanitizeCompactionInputFilesForAllLevels( + std::unordered_set<uint64_t>* input_files, + const ColumnFamilyMetaData& cf_meta, const int output_level) const { + auto& levels = cf_meta.levels; + auto comparator = icmp_->user_comparator(); + + // TODO(yhchiang): add is_adjustable to CompactionOptions + + // the smallest and largest key of the current compaction input + std::string smallestkey; + std::string largestkey; + // a flag for initializing smallest and largest key + bool is_first = false; + const int kNotFound = -1; + + // For each level, it does the following things: + // 1. Find the first and the last compaction input files + // in the current level. + // 2. Include all files between the first and the last + // compaction input files. + // 3. Update the compaction key-range. + // 4. For all remaining levels, include files that have + // overlapping key-range with the compaction key-range. + for (int l = 0; l <= output_level; ++l) { + auto& current_files = levels[l].files; + int first_included = static_cast<int>(current_files.size()); + int last_included = kNotFound; + + // identify the first and the last compaction input files + // in the current level. 
+ for (size_t f = 0; f < current_files.size(); ++f) { + if (input_files->find(TableFileNameToNumber(current_files[f].name)) != + input_files->end()) { + first_included = std::min(first_included, static_cast<int>(f)); + last_included = std::max(last_included, static_cast<int>(f)); + if (is_first == false) { + smallestkey = current_files[f].smallestkey; + largestkey = current_files[f].largestkey; + is_first = true; + } + } + } + if (last_included == kNotFound) { + continue; + } + + if (l != 0) { + // expend the compaction input of the current level if it + // has overlapping key-range with other non-compaction input + // files in the same level. + while (first_included > 0) { + if (comparator->Compare(current_files[first_included - 1].largestkey, + current_files[first_included].smallestkey) < + 0) { + break; + } + first_included--; + } + + while (last_included < static_cast<int>(current_files.size()) - 1) { + if (comparator->Compare(current_files[last_included + 1].smallestkey, + current_files[last_included].largestkey) > 0) { + break; + } + last_included++; + } + } else if (output_level > 0) { + last_included = static_cast<int>(current_files.size() - 1); + } + + // include all files between the first and the last compaction input files. + for (int f = first_included; f <= last_included; ++f) { + if (current_files[f].being_compacted) { + return Status::Aborted("Necessary compaction input file " + + current_files[f].name + + " is currently being compacted."); + } + input_files->insert(TableFileNameToNumber(current_files[f].name)); + } + + // update smallest and largest key + if (l == 0) { + for (int f = first_included; f <= last_included; ++f) { + if (comparator->Compare(smallestkey, current_files[f].smallestkey) > + 0) { + smallestkey = current_files[f].smallestkey; + } + if (comparator->Compare(largestkey, current_files[f].largestkey) < 0) { + largestkey = current_files[f].largestkey; + } + } + } else { + if (comparator->Compare(smallestkey, + current_files[first_included].smallestkey) > 0) { + smallestkey = current_files[first_included].smallestkey; + } + if (comparator->Compare(largestkey, + current_files[last_included].largestkey) < 0) { + largestkey = current_files[last_included].largestkey; + } + } + + SstFileMetaData aggregated_file_meta; + aggregated_file_meta.smallestkey = smallestkey; + aggregated_file_meta.largestkey = largestkey; + + // For all lower levels, include all overlapping files. 
+ // We need to add overlapping files from the current level too because even + // if there no input_files in level l, we would still need to add files + // which overlap with the range containing the input_files in levels 0 to l + // Level 0 doesn't need to be handled this way because files are sorted by + // time and not by key + for (int m = std::max(l, 1); m <= output_level; ++m) { + for (auto& next_lv_file : levels[m].files) { + if (HaveOverlappingKeyRanges(comparator, aggregated_file_meta, + next_lv_file)) { + if (next_lv_file.being_compacted) { + return Status::Aborted( + "File " + next_lv_file.name + + " that has overlapping key range with one of the compaction " + " input file is currently being compacted."); + } + input_files->insert(TableFileNameToNumber(next_lv_file.name)); + } + } + } + } + if (RangeOverlapWithCompaction(smallestkey, largestkey, output_level)) { + return Status::Aborted( + "A running compaction is writing to the same output level in an " + "overlapping key range"); + } + return Status::OK(); +} + +Status CompactionPicker::SanitizeCompactionInputFiles( + std::unordered_set<uint64_t>* input_files, + const ColumnFamilyMetaData& cf_meta, const int output_level) const { + assert(static_cast<int>(cf_meta.levels.size()) - 1 == + cf_meta.levels[cf_meta.levels.size() - 1].level); + if (output_level >= static_cast<int>(cf_meta.levels.size())) { + return Status::InvalidArgument( + "Output level for column family " + cf_meta.name + + " must between [0, " + + ToString(cf_meta.levels[cf_meta.levels.size() - 1].level) + "]."); + } + + if (output_level > MaxOutputLevel()) { + return Status::InvalidArgument( + "Exceed the maximum output level defined by " + "the current compaction algorithm --- " + + ToString(MaxOutputLevel())); + } + + if (output_level < 0) { + return Status::InvalidArgument("Output level cannot be negative."); + } + + if (input_files->size() == 0) { + return Status::InvalidArgument( + "A compaction must contain at least one file."); + } + + Status s = SanitizeCompactionInputFilesForAllLevels(input_files, cf_meta, + output_level); + + if (!s.ok()) { + return s; + } + + // for all input files, check whether the file number matches + // any currently-existing files. 
+ for (auto file_num : *input_files) { + bool found = false; + for (const auto& level_meta : cf_meta.levels) { + for (const auto& file_meta : level_meta.files) { + if (file_num == TableFileNameToNumber(file_meta.name)) { + if (file_meta.being_compacted) { + return Status::Aborted("Specified compaction input file " + + MakeTableFileName("", file_num) + + " is already being compacted."); + } + found = true; + break; + } + } + if (found) { + break; + } + } + if (!found) { + return Status::InvalidArgument( + "Specified compaction input file " + MakeTableFileName("", file_num) + + " does not exist in column family " + cf_meta.name + "."); + } + } + + return Status::OK(); +} +#endif // !ROCKSDB_LITE + +void CompactionPicker::RegisterCompaction(Compaction* c) { + if (c == nullptr) { + return; + } + assert(ioptions_.compaction_style != kCompactionStyleLevel || + c->output_level() == 0 || + !FilesRangeOverlapWithCompaction(*c->inputs(), c->output_level())); + if (c->start_level() == 0 || + ioptions_.compaction_style == kCompactionStyleUniversal) { + level0_compactions_in_progress_.insert(c); + } + compactions_in_progress_.insert(c); +} + +void CompactionPicker::UnregisterCompaction(Compaction* c) { + if (c == nullptr) { + return; + } + if (c->start_level() == 0 || + ioptions_.compaction_style == kCompactionStyleUniversal) { + level0_compactions_in_progress_.erase(c); + } + compactions_in_progress_.erase(c); +} + +void CompactionPicker::PickFilesMarkedForCompaction( + const std::string& cf_name, VersionStorageInfo* vstorage, int* start_level, + int* output_level, CompactionInputFiles* start_level_inputs) { + if (vstorage->FilesMarkedForCompaction().empty()) { + return; + } + + auto continuation = [&, cf_name](std::pair<int, FileMetaData*> level_file) { + // If it's being compacted it has nothing to do here. + // If this assert() fails that means that some function marked some + // files as being_compacted, but didn't call ComputeCompactionScore() + assert(!level_file.second->being_compacted); + *start_level = level_file.first; + *output_level = + (*start_level == 0) ? vstorage->base_level() : *start_level + 1; + + if (*start_level == 0 && !level0_compactions_in_progress()->empty()) { + return false; + } + + start_level_inputs->files = {level_file.second}; + start_level_inputs->level = *start_level; + return ExpandInputsToCleanCut(cf_name, vstorage, start_level_inputs); + }; + + // take a chance on a random file first + Random64 rnd(/* seed */ reinterpret_cast<uint64_t>(vstorage)); + size_t random_file_index = static_cast<size_t>(rnd.Uniform( + static_cast<uint64_t>(vstorage->FilesMarkedForCompaction().size()))); + + if (continuation(vstorage->FilesMarkedForCompaction()[random_file_index])) { + // found the compaction! + return; + } + + for (auto& level_file : vstorage->FilesMarkedForCompaction()) { + if (continuation(level_file)) { + // found the compaction! + return; + } + } + start_level_inputs->files.clear(); +} + +bool CompactionPicker::GetOverlappingL0Files( + VersionStorageInfo* vstorage, CompactionInputFiles* start_level_inputs, + int output_level, int* parent_index) { + // Two level 0 compaction won't run at the same time, so don't need to worry + // about files on level 0 being compacted. 
+ assert(level0_compactions_in_progress()->empty()); + InternalKey smallest, largest; + GetRange(*start_level_inputs, &smallest, &largest); + // Note that the next call will discard the file we placed in + // c->inputs_[0] earlier and replace it with an overlapping set + // which will include the picked file. + start_level_inputs->files.clear(); + vstorage->GetOverlappingInputs(0, &smallest, &largest, + &(start_level_inputs->files)); + + // If we include more L0 files in the same compaction run it can + // cause the 'smallest' and 'largest' key to get extended to a + // larger range. So, re-invoke GetRange to get the new key range + GetRange(*start_level_inputs, &smallest, &largest); + if (IsRangeInCompaction(vstorage, &smallest, &largest, output_level, + parent_index)) { + return false; + } + assert(!start_level_inputs->files.empty()); + + return true; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/compaction/compaction_picker.h b/src/rocksdb/db/compaction/compaction_picker.h new file mode 100644 index 000000000..36d570e68 --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_picker.h @@ -0,0 +1,313 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include <memory> +#include <set> +#include <string> +#include <unordered_set> +#include <vector> + +#include "db/compaction/compaction.h" +#include "db/version_set.h" +#include "options/cf_options.h" +#include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +// The file contains an abstract class CompactionPicker, and its two +// sub-classes LevelCompactionPicker and NullCompactionPicker, as +// well as some helper functions used by them. + +class LogBuffer; +class Compaction; +class VersionStorageInfo; +struct CompactionInputFiles; + +// An abstract class to pick compactions from an existing LSM-tree. +// +// Each compaction style inherits the class and implement the +// interface to form automatic compactions. If NeedCompaction() is true, +// then call PickCompaction() to find what files need to be compacted +// and where to put the output files. +// +// Non-virtual functions CompactRange() and CompactFiles() are used to +// pick files to compact based on users' DB::CompactRange() and +// DB::CompactFiles() requests, respectively. There is little +// compaction style specific logic for them. +class CompactionPicker { + public: + CompactionPicker(const ImmutableCFOptions& ioptions, + const InternalKeyComparator* icmp); + virtual ~CompactionPicker(); + + // Pick level and inputs for a new compaction. + // Returns nullptr if there is no compaction to be done. + // Otherwise returns a pointer to a heap-allocated object that + // describes the compaction. Caller should delete the result. + virtual Compaction* PickCompaction( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, LogBuffer* log_buffer, + SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) = 0; + + // Return a compaction object for compacting the range [begin,end] in + // the specified level. 
Returns nullptr if there is nothing in that + // level that overlaps the specified range. Caller should delete + // the result. + // + // The returned Compaction might not include the whole requested range. + // In that case, compaction_end will be set to the next key that needs + // compacting. In case the compaction will compact the whole range, + // compaction_end will be set to nullptr. + // Client is responsible for compaction_end storage -- when called, + // *compaction_end should point to valid InternalKey! + virtual Compaction* CompactRange( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, int input_level, int output_level, + const CompactRangeOptions& compact_range_options, + const InternalKey* begin, const InternalKey* end, + InternalKey** compaction_end, bool* manual_conflict, + uint64_t max_file_num_to_ignore); + + // The maximum allowed output level. Default value is NumberLevels() - 1. + virtual int MaxOutputLevel() const { return NumberLevels() - 1; } + + virtual bool NeedsCompaction(const VersionStorageInfo* vstorage) const = 0; + +// Sanitize the input set of compaction input files. +// When the input parameters do not describe a valid compaction, the +// function will try to fix the input_files by adding necessary +// files. If it's not possible to conver an invalid input_files +// into a valid one by adding more files, the function will return a +// non-ok status with specific reason. +#ifndef ROCKSDB_LITE + Status SanitizeCompactionInputFiles(std::unordered_set<uint64_t>* input_files, + const ColumnFamilyMetaData& cf_meta, + const int output_level) const; +#endif // ROCKSDB_LITE + + // Free up the files that participated in a compaction + // + // Requirement: DB mutex held + void ReleaseCompactionFiles(Compaction* c, Status status); + + // Returns true if any one of the specified files are being compacted + bool AreFilesInCompaction(const std::vector<FileMetaData*>& files); + + // Takes a list of CompactionInputFiles and returns a (manual) Compaction + // object. + // + // Caller must provide a set of input files that has been passed through + // `SanitizeCompactionInputFiles` earlier. The lock should not be released + // between that call and this one. + Compaction* CompactFiles(const CompactionOptions& compact_options, + const std::vector<CompactionInputFiles>& input_files, + int output_level, VersionStorageInfo* vstorage, + const MutableCFOptions& mutable_cf_options, + uint32_t output_path_id); + + // Converts a set of compaction input file numbers into + // a list of CompactionInputFiles. + Status GetCompactionInputsFromFileNumbers( + std::vector<CompactionInputFiles>* input_files, + std::unordered_set<uint64_t>* input_set, + const VersionStorageInfo* vstorage, + const CompactionOptions& compact_options) const; + + // Is there currently a compaction involving level 0 taking place + bool IsLevel0CompactionInProgress() const { + return !level0_compactions_in_progress_.empty(); + } + + // Return true if the passed key range overlap with a compaction output + // that is currently running. + bool RangeOverlapWithCompaction(const Slice& smallest_user_key, + const Slice& largest_user_key, + int level) const; + + // Stores the minimal range that covers all entries in inputs in + // *smallest, *largest. 
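CompactRange() above serves the user-facing DB::CompactRange() request, and *compaction_end reports where the next round should resume when the whole range cannot be covered at once. From the application side the call looks roughly like this minimal usage sketch, assuming the usual public headers; the key names are arbitrary:

#include <cassert>

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/slice.h"

// Ask for a manual compaction of the user-key range ["key000", "key999"];
// internally this ends up in a CompactionPicker::CompactRange() call.
void CompactKeyRange(ROCKSDB_NAMESPACE::DB* db) {
  ROCKSDB_NAMESPACE::CompactRangeOptions opts;
  opts.exclusive_manual_compaction = false;  // let automatic compactions run
  ROCKSDB_NAMESPACE::Slice begin("key000");
  ROCKSDB_NAMESPACE::Slice end("key999");
  ROCKSDB_NAMESPACE::Status s = db->CompactRange(opts, &begin, &end);
  assert(s.ok());
  (void)s;
}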
+ // REQUIRES: inputs is not empty + void GetRange(const CompactionInputFiles& inputs, InternalKey* smallest, + InternalKey* largest) const; + + // Stores the minimal range that covers all entries in inputs1 and inputs2 + // in *smallest, *largest. + // REQUIRES: inputs is not empty + void GetRange(const CompactionInputFiles& inputs1, + const CompactionInputFiles& inputs2, InternalKey* smallest, + InternalKey* largest) const; + + // Stores the minimal range that covers all entries in inputs + // in *smallest, *largest. + // REQUIRES: inputs is not empty (at least on entry have one file) + void GetRange(const std::vector<CompactionInputFiles>& inputs, + InternalKey* smallest, InternalKey* largest) const; + + int NumberLevels() const { return ioptions_.num_levels; } + + // Add more files to the inputs on "level" to make sure that + // no newer version of a key is compacted to "level+1" while leaving an older + // version in a "level". Otherwise, any Get() will search "level" first, + // and will likely return an old/stale value for the key, since it always + // searches in increasing order of level to find the value. This could + // also scramble the order of merge operands. This function should be + // called any time a new Compaction is created, and its inputs_[0] are + // populated. + // + // Will return false if it is impossible to apply this compaction. + bool ExpandInputsToCleanCut(const std::string& cf_name, + VersionStorageInfo* vstorage, + CompactionInputFiles* inputs, + InternalKey** next_smallest = nullptr); + + // Returns true if any one of the parent files are being compacted + bool IsRangeInCompaction(VersionStorageInfo* vstorage, + const InternalKey* smallest, + const InternalKey* largest, int level, int* index); + + // Returns true if the key range that `inputs` files cover overlap with the + // key range of a currently running compaction. + bool FilesRangeOverlapWithCompaction( + const std::vector<CompactionInputFiles>& inputs, int level) const; + + bool SetupOtherInputs(const std::string& cf_name, + const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, + CompactionInputFiles* inputs, + CompactionInputFiles* output_level_inputs, + int* parent_index, int base_index); + + void GetGrandparents(VersionStorageInfo* vstorage, + const CompactionInputFiles& inputs, + const CompactionInputFiles& output_level_inputs, + std::vector<FileMetaData*>* grandparents); + + void PickFilesMarkedForCompaction(const std::string& cf_name, + VersionStorageInfo* vstorage, + int* start_level, int* output_level, + CompactionInputFiles* start_level_inputs); + + bool GetOverlappingL0Files(VersionStorageInfo* vstorage, + CompactionInputFiles* start_level_inputs, + int output_level, int* parent_index); + + // Register this compaction in the set of running compactions + void RegisterCompaction(Compaction* c); + + // Remove this compaction from the set of running compactions + void UnregisterCompaction(Compaction* c); + + std::set<Compaction*>* level0_compactions_in_progress() { + return &level0_compactions_in_progress_; + } + std::unordered_set<Compaction*>* compactions_in_progress() { + return &compactions_in_progress_; + } + + protected: + const ImmutableCFOptions& ioptions_; + +// A helper function to SanitizeCompactionInputFiles() that +// sanitizes "input_files" by adding necessary files. 
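ExpandInputsToCleanCut(), documented above, exists because adjacent files in one level can share a boundary user key: compacting only some of those files could move a newer version of that key to level+1 while an older version stays behind and keeps being found first by reads. A simplified, self-contained sketch of the expansion, with LevelFile as an illustrative stand-in type:

#include <cstddef>
#include <string>
#include <vector>

// Hypothetical view of one sorted, non-overlapping level: each file keeps its
// boundary user keys. Different versions of a user key may sit in adjacent
// files, so a compaction must take either all of them or none.
struct LevelFile {
  std::string smallest_user_key;
  std::string largest_user_key;
};

// Grow the half-open selection [*begin, *end) of files until it no longer
// splits a user key across its boundary -- the "clean cut" property.
// REQUIRES: the initial selection is non-empty.
void ExpandToCleanCut(const std::vector<LevelFile>& level, size_t* begin,
                      size_t* end) {
  bool grew = true;
  while (grew) {
    grew = false;
    if (*begin > 0 && level[*begin - 1].largest_user_key ==
                          level[*begin].smallest_user_key) {
      --*begin;  // previous file ends with the user key we start with
      grew = true;
    }
    if (*end < level.size() && level[*end - 1].largest_user_key ==
                                   level[*end].smallest_user_key) {
      ++*end;  // next file begins with the user key we end with
      grew = true;
    }
  }
}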
+#ifndef ROCKSDB_LITE + virtual Status SanitizeCompactionInputFilesForAllLevels( + std::unordered_set<uint64_t>* input_files, + const ColumnFamilyMetaData& cf_meta, const int output_level) const; +#endif // ROCKSDB_LITE + + // Keeps track of all compactions that are running on Level0. + // Protected by DB mutex + std::set<Compaction*> level0_compactions_in_progress_; + + // Keeps track of all compactions that are running. + // Protected by DB mutex + std::unordered_set<Compaction*> compactions_in_progress_; + + const InternalKeyComparator* const icmp_; +}; + +#ifndef ROCKSDB_LITE +// A dummy compaction that never triggers any automatic +// compaction. +class NullCompactionPicker : public CompactionPicker { + public: + NullCompactionPicker(const ImmutableCFOptions& ioptions, + const InternalKeyComparator* icmp) + : CompactionPicker(ioptions, icmp) {} + virtual ~NullCompactionPicker() {} + + // Always return "nullptr" + Compaction* PickCompaction( + const std::string& /*cf_name*/, + const MutableCFOptions& /*mutable_cf_options*/, + VersionStorageInfo* /*vstorage*/, LogBuffer* /* log_buffer */, + SequenceNumber /* earliest_memtable_seqno */) override { + return nullptr; + } + + // Always return "nullptr" + Compaction* CompactRange(const std::string& /*cf_name*/, + const MutableCFOptions& /*mutable_cf_options*/, + VersionStorageInfo* /*vstorage*/, + int /*input_level*/, int /*output_level*/, + const CompactRangeOptions& /*compact_range_options*/, + const InternalKey* /*begin*/, + const InternalKey* /*end*/, + InternalKey** /*compaction_end*/, + bool* /*manual_conflict*/, + uint64_t /*max_file_num_to_ignore*/) override { + return nullptr; + } + + // Always returns false. + virtual bool NeedsCompaction( + const VersionStorageInfo* /*vstorage*/) const override { + return false; + } +}; +#endif // !ROCKSDB_LITE + +// Attempts to find an intra L0 compaction conforming to the given parameters. +// +// @param level_files Metadata for L0 files. +// @param min_files_to_compact Minimum number of files required to +// do the compaction. +// @param max_compact_bytes_per_del_file Maximum average size in bytes per +// file that is going to get deleted by +// the compaction. +// @param max_compaction_bytes Maximum total size in bytes (in terms +// of compensated file size) for files +// to be compacted. +// @param [out] comp_inputs If a compaction was found, will be +// initialized with corresponding input +// files. Cannot be nullptr. +// +// @return true iff compaction was found. +bool FindIntraL0Compaction( + const std::vector<FileMetaData*>& level_files, size_t min_files_to_compact, + uint64_t max_compact_bytes_per_del_file, uint64_t max_compaction_bytes, + CompactionInputFiles* comp_inputs, + SequenceNumber earliest_mem_seqno = kMaxSequenceNumber); + +CompressionType GetCompressionType(const ImmutableCFOptions& ioptions, + const VersionStorageInfo* vstorage, + const MutableCFOptions& mutable_cf_options, + int level, int base_level, + const bool enable_compression = true); + +CompressionOptions GetCompressionOptions(const ImmutableCFOptions& ioptions, + const VersionStorageInfo* vstorage, + int level, + const bool enable_compression = true); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/compaction/compaction_picker_fifo.cc b/src/rocksdb/db/compaction/compaction_picker_fifo.cc new file mode 100644 index 000000000..b148aadc2 --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_picker_fifo.cc @@ -0,0 +1,244 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/compaction/compaction_picker_fifo.h" +#ifndef ROCKSDB_LITE + +#include <cinttypes> +#include <string> +#include <vector> +#include "db/column_family.h" +#include "logging/log_buffer.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { +namespace { +uint64_t GetTotalFilesSize(const std::vector<FileMetaData*>& files) { + uint64_t total_size = 0; + for (const auto& f : files) { + total_size += f->fd.file_size; + } + return total_size; +} +} // anonymous namespace + +bool FIFOCompactionPicker::NeedsCompaction( + const VersionStorageInfo* vstorage) const { + const int kLevel0 = 0; + return vstorage->CompactionScore(kLevel0) >= 1; +} + +Compaction* FIFOCompactionPicker::PickTTLCompaction( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, LogBuffer* log_buffer) { + assert(mutable_cf_options.ttl > 0); + + const int kLevel0 = 0; + const std::vector<FileMetaData*>& level_files = vstorage->LevelFiles(kLevel0); + uint64_t total_size = GetTotalFilesSize(level_files); + + int64_t _current_time; + auto status = ioptions_.env->GetCurrentTime(&_current_time); + if (!status.ok()) { + ROCKS_LOG_BUFFER(log_buffer, + "[%s] FIFO compaction: Couldn't get current time: %s. " + "Not doing compactions based on TTL. ", + cf_name.c_str(), status.ToString().c_str()); + return nullptr; + } + const uint64_t current_time = static_cast<uint64_t>(_current_time); + + if (!level0_compactions_in_progress_.empty()) { + ROCKS_LOG_BUFFER( + log_buffer, + "[%s] FIFO compaction: Already executing compaction. No need " + "to run parallel compactions since compactions are very fast", + cf_name.c_str()); + return nullptr; + } + + std::vector<CompactionInputFiles> inputs; + inputs.emplace_back(); + inputs[0].level = 0; + + // avoid underflow + if (current_time > mutable_cf_options.ttl) { + for (auto ritr = level_files.rbegin(); ritr != level_files.rend(); ++ritr) { + FileMetaData* f = *ritr; + if (f->fd.table_reader && f->fd.table_reader->GetTableProperties()) { + uint64_t creation_time = + f->fd.table_reader->GetTableProperties()->creation_time; + if (creation_time == 0 || + creation_time >= (current_time - mutable_cf_options.ttl)) { + break; + } + total_size -= f->compensated_file_size; + inputs[0].files.push_back(f); + } + } + } + + // Return a nullptr and proceed to size-based FIFO compaction if: + // 1. there are no files older than ttl OR + // 2. there are a few files older than ttl, but deleting them will not bring + // the total size to be less than max_table_files_size threshold. 
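PickTTLCompaction walks L0 from the oldest file toward the newest (the rbegin() loop above) and collects every file whose recorded creation time falls before now - ttl, stopping at the first file that is still young enough or has no creation time. A standalone arithmetic sketch of that cutoff; FileInfo and FilesOlderThanTtl are illustrative names only:

#include <cstddef>
#include <cstdint>
#include <vector>

struct FileInfo {
  uint64_t creation_time;  // seconds since epoch; 0 means "unknown"
  uint64_t size;
};

// `newest_first` mirrors how L0 files are ordered; the scan starts at the
// back (oldest file) and stops at the first file younger than the cutoff.
std::vector<size_t> FilesOlderThanTtl(const std::vector<FileInfo>& newest_first,
                                      uint64_t now, uint64_t ttl) {
  std::vector<size_t> picked;
  if (now <= ttl) {
    return picked;  // avoid underflow, as the real code does
  }
  const uint64_t cutoff = now - ttl;
  for (size_t i = newest_first.size(); i-- > 0;) {
    const FileInfo& f = newest_first[i];
    if (f.creation_time == 0 || f.creation_time >= cutoff) {
      break;
    }
    picked.push_back(i);
  }
  return picked;
}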
+ if (inputs[0].files.empty() || + total_size > + mutable_cf_options.compaction_options_fifo.max_table_files_size) { + return nullptr; + } + + for (const auto& f : inputs[0].files) { + uint64_t creation_time = 0; + if (f && f->fd.table_reader && f->fd.table_reader->GetTableProperties()) { + creation_time = f->fd.table_reader->GetTableProperties()->creation_time; + } + ROCKS_LOG_BUFFER(log_buffer, + "[%s] FIFO compaction: picking file %" PRIu64 + " with creation time %" PRIu64 " for deletion", + cf_name.c_str(), f->fd.GetNumber(), creation_time); + } + + Compaction* c = new Compaction( + vstorage, ioptions_, mutable_cf_options, std::move(inputs), 0, 0, 0, 0, + kNoCompression, ioptions_.compression_opts, /* max_subcompactions */ 0, + {}, /* is manual */ false, vstorage->CompactionScore(0), + /* is deletion compaction */ true, CompactionReason::kFIFOTtl); + return c; +} + +Compaction* FIFOCompactionPicker::PickSizeCompaction( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, LogBuffer* log_buffer) { + const int kLevel0 = 0; + const std::vector<FileMetaData*>& level_files = vstorage->LevelFiles(kLevel0); + uint64_t total_size = GetTotalFilesSize(level_files); + + if (total_size <= + mutable_cf_options.compaction_options_fifo.max_table_files_size || + level_files.size() == 0) { + // total size not exceeded + if (mutable_cf_options.compaction_options_fifo.allow_compaction && + level_files.size() > 0) { + CompactionInputFiles comp_inputs; + // try to prevent same files from being compacted multiple times, which + // could produce large files that may never TTL-expire. Achieve this by + // disallowing compactions with files larger than memtable (inflate its + // size by 10% to account for uncompressed L0 files that may have size + // slightly greater than memtable size limit). + size_t max_compact_bytes_per_del_file = + static_cast<size_t>(MultiplyCheckOverflow( + static_cast<uint64_t>(mutable_cf_options.write_buffer_size), + 1.1)); + if (FindIntraL0Compaction( + level_files, + mutable_cf_options + .level0_file_num_compaction_trigger /* min_files_to_compact */ + , + max_compact_bytes_per_del_file, + mutable_cf_options.max_compaction_bytes, &comp_inputs)) { + Compaction* c = new Compaction( + vstorage, ioptions_, mutable_cf_options, {comp_inputs}, 0, + 16 * 1024 * 1024 /* output file size limit */, + 0 /* max compaction bytes, not applicable */, + 0 /* output path ID */, mutable_cf_options.compression, + ioptions_.compression_opts, 0 /* max_subcompactions */, {}, + /* is manual */ false, vstorage->CompactionScore(0), + /* is deletion compaction */ false, + CompactionReason::kFIFOReduceNumFiles); + return c; + } + } + + ROCKS_LOG_BUFFER( + log_buffer, + "[%s] FIFO compaction: nothing to do. Total size %" PRIu64 + ", max size %" PRIu64 "\n", + cf_name.c_str(), total_size, + mutable_cf_options.compaction_options_fifo.max_table_files_size); + return nullptr; + } + + if (!level0_compactions_in_progress_.empty()) { + ROCKS_LOG_BUFFER( + log_buffer, + "[%s] FIFO compaction: Already executing compaction. 
No need " + "to run parallel compactions since compactions are very fast", + cf_name.c_str()); + return nullptr; + } + + std::vector<CompactionInputFiles> inputs; + inputs.emplace_back(); + inputs[0].level = 0; + + for (auto ritr = level_files.rbegin(); ritr != level_files.rend(); ++ritr) { + auto f = *ritr; + total_size -= f->compensated_file_size; + inputs[0].files.push_back(f); + char tmp_fsize[16]; + AppendHumanBytes(f->fd.GetFileSize(), tmp_fsize, sizeof(tmp_fsize)); + ROCKS_LOG_BUFFER(log_buffer, + "[%s] FIFO compaction: picking file %" PRIu64 + " with size %s for deletion", + cf_name.c_str(), f->fd.GetNumber(), tmp_fsize); + if (total_size <= + mutable_cf_options.compaction_options_fifo.max_table_files_size) { + break; + } + } + + Compaction* c = new Compaction( + vstorage, ioptions_, mutable_cf_options, std::move(inputs), 0, 0, 0, 0, + kNoCompression, ioptions_.compression_opts, /* max_subcompactions */ 0, + {}, /* is manual */ false, vstorage->CompactionScore(0), + /* is deletion compaction */ true, CompactionReason::kFIFOMaxSize); + return c; +} + +Compaction* FIFOCompactionPicker::PickCompaction( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, LogBuffer* log_buffer, + SequenceNumber /*earliest_memtable_seqno*/) { + assert(vstorage->num_levels() == 1); + + Compaction* c = nullptr; + if (mutable_cf_options.ttl > 0) { + c = PickTTLCompaction(cf_name, mutable_cf_options, vstorage, log_buffer); + } + if (c == nullptr) { + c = PickSizeCompaction(cf_name, mutable_cf_options, vstorage, log_buffer); + } + RegisterCompaction(c); + return c; +} + +Compaction* FIFOCompactionPicker::CompactRange( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, int input_level, int output_level, + const CompactRangeOptions& /*compact_range_options*/, + const InternalKey* /*begin*/, const InternalKey* /*end*/, + InternalKey** compaction_end, bool* /*manual_conflict*/, + uint64_t /*max_file_num_to_ignore*/) { +#ifdef NDEBUG + (void)input_level; + (void)output_level; +#endif + assert(input_level == 0); + assert(output_level == 0); + *compaction_end = nullptr; + LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, ioptions_.info_log); + Compaction* c = + PickCompaction(cf_name, mutable_cf_options, vstorage, &log_buffer); + log_buffer.FlushBufferToLog(); + return c; +} + +} // namespace ROCKSDB_NAMESPACE +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/db/compaction/compaction_picker_fifo.h b/src/rocksdb/db/compaction/compaction_picker_fifo.h new file mode 100644 index 000000000..eb786e5ac --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_picker_fifo.h @@ -0,0 +1,53 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
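For completeness, this is roughly how a column family opts into the FIFO picker implemented above through the public Options surface. Field names follow the usual public headers and the values are arbitrary examples, so treat it as a hedged sketch rather than recommended settings.

#include "rocksdb/options.h"

ROCKSDB_NAMESPACE::Options MakeFifoOptions() {
  ROCKSDB_NAMESPACE::Options options;
  options.compaction_style = ROCKSDB_NAMESPACE::kCompactionStyleFIFO;
  // Size-based eviction: once the total L0 size exceeds 1 GiB,
  // PickSizeCompaction deletes the oldest files until it fits again.
  options.compaction_options_fifo.max_table_files_size = 1ull << 30;
  // Let PickSizeCompaction also fall back to intra-L0 compactions so FIFO
  // does not accumulate a large number of tiny files.
  options.compaction_options_fifo.allow_compaction = true;
  // Age-based eviction: PickTTLCompaction drops files older than one day.
  options.ttl = 24 * 60 * 60;
  return options;
}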
+ +#pragma once +#ifndef ROCKSDB_LITE + +#include "db/compaction/compaction_picker.h" + +namespace ROCKSDB_NAMESPACE { +class FIFOCompactionPicker : public CompactionPicker { + public: + FIFOCompactionPicker(const ImmutableCFOptions& ioptions, + const InternalKeyComparator* icmp) + : CompactionPicker(ioptions, icmp) {} + + virtual Compaction* PickCompaction( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* version, LogBuffer* log_buffer, + SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override; + + virtual Compaction* CompactRange( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, int input_level, int output_level, + const CompactRangeOptions& compact_range_options, + const InternalKey* begin, const InternalKey* end, + InternalKey** compaction_end, bool* manual_conflict, + uint64_t max_file_num_to_ignore) override; + + // The maximum allowed output level. Always returns 0. + virtual int MaxOutputLevel() const override { return 0; } + + virtual bool NeedsCompaction( + const VersionStorageInfo* vstorage) const override; + + private: + Compaction* PickTTLCompaction(const std::string& cf_name, + const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* version, + LogBuffer* log_buffer); + + Compaction* PickSizeCompaction(const std::string& cf_name, + const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* version, + LogBuffer* log_buffer); +}; +} // namespace ROCKSDB_NAMESPACE +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/db/compaction/compaction_picker_level.cc b/src/rocksdb/db/compaction/compaction_picker_level.cc new file mode 100644 index 000000000..012edd080 --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_picker_level.cc @@ -0,0 +1,558 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include <string> +#include <utility> +#include <vector> + +#include "db/compaction/compaction_picker_level.h" +#include "logging/log_buffer.h" +#include "test_util/sync_point.h" + +namespace ROCKSDB_NAMESPACE { + +bool LevelCompactionPicker::NeedsCompaction( + const VersionStorageInfo* vstorage) const { + if (!vstorage->ExpiredTtlFiles().empty()) { + return true; + } + if (!vstorage->FilesMarkedForPeriodicCompaction().empty()) { + return true; + } + if (!vstorage->BottommostFilesMarkedForCompaction().empty()) { + return true; + } + if (!vstorage->FilesMarkedForCompaction().empty()) { + return true; + } + for (int i = 0; i <= vstorage->MaxInputLevel(); i++) { + if (vstorage->CompactionScore(i) >= 1) { + return true; + } + } + return false; +} + +namespace { +// A class to build a leveled compaction step-by-step. 
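LevelCompactionPicker::NeedsCompaction() above and SetupInitialFiles() below key off per-level compaction scores: roughly the number of L0 files divided by level0_file_num_compaction_trigger for level 0, and the level's size divided by its byte target for deeper levels, as the inline comments in SetupInitialFiles() note. A small illustrative computation; the real logic lives in VersionStorageInfo::ComputeCompactionScore and is more involved:

#include <cstdint>

// Rough, illustration-only score formulas.
double Level0Score(int num_l0_files, int level0_file_num_compaction_trigger) {
  return static_cast<double>(num_l0_files) /
         static_cast<double>(level0_file_num_compaction_trigger);
}

double LevelNScore(uint64_t level_bytes, uint64_t max_bytes_for_level) {
  return static_cast<double>(level_bytes) /
         static_cast<double>(max_bytes_for_level);
}

// Example: 6 L0 files against a trigger of 4 give a score of 1.5, so L0 is
// picked ahead of an L1 holding 300 MB against a 256 MB target (score ~1.17).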
+class LevelCompactionBuilder { + public: + LevelCompactionBuilder(const std::string& cf_name, + VersionStorageInfo* vstorage, + SequenceNumber earliest_mem_seqno, + CompactionPicker* compaction_picker, + LogBuffer* log_buffer, + const MutableCFOptions& mutable_cf_options, + const ImmutableCFOptions& ioptions) + : cf_name_(cf_name), + vstorage_(vstorage), + earliest_mem_seqno_(earliest_mem_seqno), + compaction_picker_(compaction_picker), + log_buffer_(log_buffer), + mutable_cf_options_(mutable_cf_options), + ioptions_(ioptions) {} + + // Pick and return a compaction. + Compaction* PickCompaction(); + + // Pick the initial files to compact to the next level. (or together + // in Intra-L0 compactions) + void SetupInitialFiles(); + + // If the initial files are from L0 level, pick other L0 + // files if needed. + bool SetupOtherL0FilesIfNeeded(); + + // Based on initial files, setup other files need to be compacted + // in this compaction, accordingly. + bool SetupOtherInputsIfNeeded(); + + Compaction* GetCompaction(); + + // For the specfied level, pick a file that we want to compact. + // Returns false if there is no file to compact. + // If it returns true, inputs->files.size() will be exactly one. + // If level is 0 and there is already a compaction on that level, this + // function will return false. + bool PickFileToCompact(); + + // For L0->L0, picks the longest span of files that aren't currently + // undergoing compaction for which work-per-deleted-file decreases. The span + // always starts from the newest L0 file. + // + // Intra-L0 compaction is independent of all other files, so it can be + // performed even when L0->base_level compactions are blocked. + // + // Returns true if `inputs` is populated with a span of files to be compacted; + // otherwise, returns false. + bool PickIntraL0Compaction(); + + void PickExpiredTtlFiles(); + + void PickFilesMarkedForPeriodicCompaction(); + + const std::string& cf_name_; + VersionStorageInfo* vstorage_; + SequenceNumber earliest_mem_seqno_; + CompactionPicker* compaction_picker_; + LogBuffer* log_buffer_; + int start_level_ = -1; + int output_level_ = -1; + int parent_index_ = -1; + int base_index_ = -1; + double start_level_score_ = 0; + bool is_manual_ = false; + CompactionInputFiles start_level_inputs_; + std::vector<CompactionInputFiles> compaction_inputs_; + CompactionInputFiles output_level_inputs_; + std::vector<FileMetaData*> grandparents_; + CompactionReason compaction_reason_ = CompactionReason::kUnknown; + + const MutableCFOptions& mutable_cf_options_; + const ImmutableCFOptions& ioptions_; + // Pick a path ID to place a newly generated file, with its level + static uint32_t GetPathId(const ImmutableCFOptions& ioptions, + const MutableCFOptions& mutable_cf_options, + int level); + + static const int kMinFilesForIntraL0Compaction = 4; +}; + +void LevelCompactionBuilder::PickExpiredTtlFiles() { + if (vstorage_->ExpiredTtlFiles().empty()) { + return; + } + + auto continuation = [&](std::pair<int, FileMetaData*> level_file) { + // If it's being compacted it has nothing to do here. + // If this assert() fails that means that some function marked some + // files as being_compacted, but didn't call ComputeCompactionScore() + assert(!level_file.second->being_compacted); + start_level_ = level_file.first; + output_level_ = + (start_level_ == 0) ? 
vstorage_->base_level() : start_level_ + 1; + + if ((start_level_ == vstorage_->num_non_empty_levels() - 1) || + (start_level_ == 0 && + !compaction_picker_->level0_compactions_in_progress()->empty())) { + return false; + } + + start_level_inputs_.files = {level_file.second}; + start_level_inputs_.level = start_level_; + return compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, + &start_level_inputs_); + }; + + for (auto& level_file : vstorage_->ExpiredTtlFiles()) { + if (continuation(level_file)) { + // found the compaction! + return; + } + } + + start_level_inputs_.files.clear(); +} + +void LevelCompactionBuilder::PickFilesMarkedForPeriodicCompaction() { + if (vstorage_->FilesMarkedForPeriodicCompaction().empty()) { + return; + } + + auto continuation = [&](std::pair<int, FileMetaData*> level_file) { + // If it's being compacted it has nothing to do here. + // If this assert() fails that means that some function marked some + // files as being_compacted, but didn't call ComputeCompactionScore() + assert(!level_file.second->being_compacted); + output_level_ = start_level_ = level_file.first; + + if (start_level_ == 0 && + !compaction_picker_->level0_compactions_in_progress()->empty()) { + return false; + } + + start_level_inputs_.files = {level_file.second}; + start_level_inputs_.level = start_level_; + return compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, + &start_level_inputs_); + }; + + for (auto& level_file : vstorage_->FilesMarkedForPeriodicCompaction()) { + if (continuation(level_file)) { + // found the compaction! + return; + } + } + + start_level_inputs_.files.clear(); +} + +void LevelCompactionBuilder::SetupInitialFiles() { + // Find the compactions by size on all levels. + bool skipped_l0_to_base = false; + for (int i = 0; i < compaction_picker_->NumberLevels() - 1; i++) { + start_level_score_ = vstorage_->CompactionScore(i); + start_level_ = vstorage_->CompactionScoreLevel(i); + assert(i == 0 || start_level_score_ <= vstorage_->CompactionScore(i - 1)); + if (start_level_score_ >= 1) { + if (skipped_l0_to_base && start_level_ == vstorage_->base_level()) { + // If L0->base_level compaction is pending, don't schedule further + // compaction from base level. Otherwise L0->base_level compaction + // may starve. + continue; + } + output_level_ = + (start_level_ == 0) ? vstorage_->base_level() : start_level_ + 1; + if (PickFileToCompact()) { + // found the compaction! + if (start_level_ == 0) { + // L0 score = `num L0 files` / `level0_file_num_compaction_trigger` + compaction_reason_ = CompactionReason::kLevelL0FilesNum; + } else { + // L1+ score = `Level files size` / `MaxBytesForLevel` + compaction_reason_ = CompactionReason::kLevelMaxLevelSize; + } + break; + } else { + // didn't find the compaction, clear the inputs + start_level_inputs_.clear(); + if (start_level_ == 0) { + skipped_l0_to_base = true; + // L0->base_level may be blocked due to ongoing L0->base_level + // compactions. It may also be blocked by an ongoing compaction from + // base_level downwards. + // + // In these cases, to reduce L0 file count and thus reduce likelihood + // of write stalls, we can attempt compacting a span of files within + // L0. 
+ if (PickIntraL0Compaction()) { + output_level_ = 0; + compaction_reason_ = CompactionReason::kLevelL0FilesNum; + break; + } + } + } + } + } + + // if we didn't find a compaction, check if there are any files marked for + // compaction + if (start_level_inputs_.empty()) { + parent_index_ = base_index_ = -1; + + compaction_picker_->PickFilesMarkedForCompaction( + cf_name_, vstorage_, &start_level_, &output_level_, + &start_level_inputs_); + if (!start_level_inputs_.empty()) { + is_manual_ = true; + compaction_reason_ = CompactionReason::kFilesMarkedForCompaction; + return; + } + } + + // Bottommost Files Compaction on deleting tombstones + if (start_level_inputs_.empty()) { + size_t i; + for (i = 0; i < vstorage_->BottommostFilesMarkedForCompaction().size(); + ++i) { + auto& level_and_file = vstorage_->BottommostFilesMarkedForCompaction()[i]; + assert(!level_and_file.second->being_compacted); + start_level_inputs_.level = output_level_ = start_level_ = + level_and_file.first; + start_level_inputs_.files = {level_and_file.second}; + if (compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, + &start_level_inputs_)) { + break; + } + } + if (i == vstorage_->BottommostFilesMarkedForCompaction().size()) { + start_level_inputs_.clear(); + } else { + assert(!start_level_inputs_.empty()); + compaction_reason_ = CompactionReason::kBottommostFiles; + return; + } + } + + // TTL Compaction + if (start_level_inputs_.empty()) { + PickExpiredTtlFiles(); + if (!start_level_inputs_.empty()) { + compaction_reason_ = CompactionReason::kTtl; + return; + } + } + + // Periodic Compaction + if (start_level_inputs_.empty()) { + PickFilesMarkedForPeriodicCompaction(); + if (!start_level_inputs_.empty()) { + compaction_reason_ = CompactionReason::kPeriodicCompaction; + return; + } + } +} + +bool LevelCompactionBuilder::SetupOtherL0FilesIfNeeded() { + if (start_level_ == 0 && output_level_ != 0) { + return compaction_picker_->GetOverlappingL0Files( + vstorage_, &start_level_inputs_, output_level_, &parent_index_); + } + return true; +} + +bool LevelCompactionBuilder::SetupOtherInputsIfNeeded() { + // Setup input files from output level. For output to L0, we only compact + // spans of files that do not interact with any pending compactions, so don't + // need to consider other levels. + if (output_level_ != 0) { + output_level_inputs_.level = output_level_; + if (!compaction_picker_->SetupOtherInputs( + cf_name_, mutable_cf_options_, vstorage_, &start_level_inputs_, + &output_level_inputs_, &parent_index_, base_index_)) { + return false; + } + + compaction_inputs_.push_back(start_level_inputs_); + if (!output_level_inputs_.empty()) { + compaction_inputs_.push_back(output_level_inputs_); + } + + // In some edge cases we could pick a compaction that will be compacting + // a key range that overlap with another running compaction, and both + // of them have the same output level. This could happen if + // (1) we are running a non-exclusive manual compaction + // (2) AddFile ingest a new file into the LSM tree + // We need to disallow this from happening. + if (compaction_picker_->FilesRangeOverlapWithCompaction(compaction_inputs_, + output_level_)) { + // This compaction output could potentially conflict with the output + // of a currently running compaction, we cannot run it. 
+ return false; + } + compaction_picker_->GetGrandparents(vstorage_, start_level_inputs_, + output_level_inputs_, &grandparents_); + } else { + compaction_inputs_.push_back(start_level_inputs_); + } + return true; +} + +Compaction* LevelCompactionBuilder::PickCompaction() { + // Pick up the first file to start compaction. It may have been extended + // to a clean cut. + SetupInitialFiles(); + if (start_level_inputs_.empty()) { + return nullptr; + } + assert(start_level_ >= 0 && output_level_ >= 0); + + // If it is a L0 -> base level compaction, we need to set up other L0 + // files if needed. + if (!SetupOtherL0FilesIfNeeded()) { + return nullptr; + } + + // Pick files in the output level and expand more files in the start level + // if needed. + if (!SetupOtherInputsIfNeeded()) { + return nullptr; + } + + // Form a compaction object containing the files we picked. + Compaction* c = GetCompaction(); + + TEST_SYNC_POINT_CALLBACK("LevelCompactionPicker::PickCompaction:Return", c); + + return c; +} + +Compaction* LevelCompactionBuilder::GetCompaction() { + auto c = new Compaction( + vstorage_, ioptions_, mutable_cf_options_, std::move(compaction_inputs_), + output_level_, + MaxFileSizeForLevel(mutable_cf_options_, output_level_, + ioptions_.compaction_style, vstorage_->base_level(), + ioptions_.level_compaction_dynamic_level_bytes), + mutable_cf_options_.max_compaction_bytes, + GetPathId(ioptions_, mutable_cf_options_, output_level_), + GetCompressionType(ioptions_, vstorage_, mutable_cf_options_, + output_level_, vstorage_->base_level()), + GetCompressionOptions(ioptions_, vstorage_, output_level_), + /* max_subcompactions */ 0, std::move(grandparents_), is_manual_, + start_level_score_, false /* deletion_compaction */, compaction_reason_); + + // If it's level 0 compaction, make sure we don't execute any other level 0 + // compactions in parallel + compaction_picker_->RegisterCompaction(c); + + // Creating a compaction influences the compaction score because the score + // takes running compactions into account (by skipping files that are already + // being compacted). Since we just changed compaction score, we recalculate it + // here + vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_); + return c; +} + +/* + * Find the optimal path to place a file + * Given a level, finds the path where levels up to it will fit in levels + * up to and including this path + */ +uint32_t LevelCompactionBuilder::GetPathId( + const ImmutableCFOptions& ioptions, + const MutableCFOptions& mutable_cf_options, int level) { + uint32_t p = 0; + assert(!ioptions.cf_paths.empty()); + + // size remaining in the most recent path + uint64_t current_path_size = ioptions.cf_paths[0].target_size; + + uint64_t level_size; + int cur_level = 0; + + // max_bytes_for_level_base denotes L1 size. + // We estimate L0 size to be the same as L1. + level_size = mutable_cf_options.max_bytes_for_level_base; + + // Last path is the fallback + while (p < ioptions.cf_paths.size() - 1) { + if (level_size <= current_path_size) { + if (cur_level == level) { + // Does desired level fit in this path? + return p; + } else { + current_path_size -= level_size; + if (cur_level > 0) { + if (ioptions.level_compaction_dynamic_level_bytes) { + // Currently, level_compaction_dynamic_level_bytes is ignored when + // multiple db paths are specified. https://github.com/facebook/ + // rocksdb/blob/master/db/column_family.cc. 
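GetPathId estimates each level's size, starting from max_bytes_for_level_base for both L0 and L1 and scaling by the multiplier for deeper levels, and assigns the level to the first cf_path whose remaining target size can hold it, with the last path as the fallback. A standalone numeric sketch of that walk; PathForLevel and the figures below are hypothetical:

#include <cstdint>
#include <vector>

uint32_t PathForLevel(int level, uint64_t l1_size, double multiplier,
                      const std::vector<uint64_t>& path_target_sizes) {
  uint32_t p = 0;
  uint64_t remaining = path_target_sizes[0];
  uint64_t level_size = l1_size;  // L0 is estimated to be as large as L1
  int cur_level = 0;
  while (p + 1 < path_target_sizes.size()) {
    if (level_size <= remaining) {
      if (cur_level == level) {
        return p;  // this level fits in the current path
      }
      remaining -= level_size;  // reserve space for it and move on
      if (cur_level > 0) {
        level_size = static_cast<uint64_t>(level_size * multiplier);
      }
      ++cur_level;
      continue;
    }
    ++p;  // spill over to the next path
    remaining = path_target_sizes[p];
  }
  return p;  // the last path takes whatever is left
}

// With l1_size = 256 MB, multiplier = 10 and path targets {1 GB, 100 GB,
// <fallback>}, levels 0-1 land on path 0, levels 2-3 on path 1, and deeper
// levels on the final path.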
+ // Still, adding this check to avoid accidentally using + // max_bytes_for_level_multiplier_additional + level_size = static_cast<uint64_t>( + level_size * mutable_cf_options.max_bytes_for_level_multiplier); + } else { + level_size = static_cast<uint64_t>( + level_size * mutable_cf_options.max_bytes_for_level_multiplier * + mutable_cf_options.MaxBytesMultiplerAdditional(cur_level)); + } + } + cur_level++; + continue; + } + } + p++; + current_path_size = ioptions.cf_paths[p].target_size; + } + return p; +} + +bool LevelCompactionBuilder::PickFileToCompact() { + // level 0 files are overlapping. So we cannot pick more + // than one concurrent compactions at this level. This + // could be made better by looking at key-ranges that are + // being compacted at level 0. + if (start_level_ == 0 && + !compaction_picker_->level0_compactions_in_progress()->empty()) { + TEST_SYNC_POINT("LevelCompactionPicker::PickCompactionBySize:0"); + return false; + } + + start_level_inputs_.clear(); + + assert(start_level_ >= 0); + + // Pick the largest file in this level that is not already + // being compacted + const std::vector<int>& file_size = + vstorage_->FilesByCompactionPri(start_level_); + const std::vector<FileMetaData*>& level_files = + vstorage_->LevelFiles(start_level_); + + unsigned int cmp_idx; + for (cmp_idx = vstorage_->NextCompactionIndex(start_level_); + cmp_idx < file_size.size(); cmp_idx++) { + int index = file_size[cmp_idx]; + auto* f = level_files[index]; + + // do not pick a file to compact if it is being compacted + // from n-1 level. + if (f->being_compacted) { + continue; + } + + start_level_inputs_.files.push_back(f); + start_level_inputs_.level = start_level_; + if (!compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, + &start_level_inputs_) || + compaction_picker_->FilesRangeOverlapWithCompaction( + {start_level_inputs_}, output_level_)) { + // A locked (pending compaction) input-level file was pulled in due to + // user-key overlap. + start_level_inputs_.clear(); + continue; + } + + // Now that input level is fully expanded, we check whether any output files + // are locked due to pending compaction. + // + // Note we rely on ExpandInputsToCleanCut() to tell us whether any output- + // level files are locked, not just the extra ones pulled in for user-key + // overlap. + InternalKey smallest, largest; + compaction_picker_->GetRange(start_level_inputs_, &smallest, &largest); + CompactionInputFiles output_level_inputs; + output_level_inputs.level = output_level_; + vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest, + &output_level_inputs.files); + if (!output_level_inputs.empty() && + !compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, + &output_level_inputs)) { + start_level_inputs_.clear(); + continue; + } + base_index_ = index; + break; + } + + // store where to start the iteration in the next call to PickCompaction + vstorage_->SetNextCompactionIndex(start_level_, cmp_idx); + + return start_level_inputs_.size() > 0; +} + +bool LevelCompactionBuilder::PickIntraL0Compaction() { + start_level_inputs_.clear(); + const std::vector<FileMetaData*>& level_files = + vstorage_->LevelFiles(0 /* level */); + if (level_files.size() < + static_cast<size_t>( + mutable_cf_options_.level0_file_num_compaction_trigger + 2) || + level_files[0]->being_compacted) { + // If L0 isn't accumulating much files beyond the regular trigger, don't + // resort to L0->L0 compaction yet. 
+ return false; + } + return FindIntraL0Compaction(level_files, kMinFilesForIntraL0Compaction, + port::kMaxUint64, + mutable_cf_options_.max_compaction_bytes, + &start_level_inputs_, earliest_mem_seqno_); +} +} // namespace + +Compaction* LevelCompactionPicker::PickCompaction( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, LogBuffer* log_buffer, + SequenceNumber earliest_mem_seqno) { + LevelCompactionBuilder builder(cf_name, vstorage, earliest_mem_seqno, this, + log_buffer, mutable_cf_options, ioptions_); + return builder.PickCompaction(); +} +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/compaction/compaction_picker_level.h b/src/rocksdb/db/compaction/compaction_picker_level.h new file mode 100644 index 000000000..b82070e14 --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_picker_level.h @@ -0,0 +1,32 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include "db/compaction/compaction_picker.h" + +namespace ROCKSDB_NAMESPACE { +// Picking compactions for leveled compaction. See wiki page +// https://github.com/facebook/rocksdb/wiki/Leveled-Compaction +// for description of Leveled compaction. +class LevelCompactionPicker : public CompactionPicker { + public: + LevelCompactionPicker(const ImmutableCFOptions& ioptions, + const InternalKeyComparator* icmp) + : CompactionPicker(ioptions, icmp) {} + virtual Compaction* PickCompaction( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, LogBuffer* log_buffer, + SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override; + + virtual bool NeedsCompaction( + const VersionStorageInfo* vstorage) const override; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/compaction/compaction_picker_test.cc b/src/rocksdb/db/compaction/compaction_picker_test.cc new file mode 100644 index 000000000..278bdb06a --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_picker_test.cc @@ -0,0 +1,1741 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
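FindIntraL0Compaction, as documented in the header above, looks for the longest span of idle files starting from the newest L0 file for which the work per deleted file keeps shrinking, subject to the file-count and byte limits. A simplified, self-contained approximation of that span selection; L0File and PickIntraL0Span are illustrative only, and the real function also honours max_compact_bytes_per_del_file and the earliest memtable sequence number:

#include <cstddef>
#include <cstdint>
#include <vector>

struct L0File {
  uint64_t compensated_size;
  bool being_compacted;
};

// Walk from the newest file and keep extending the span while the average
// bytes rewritten per deleted file still decreases; accept the span only if
// it is long enough. Returns the span length, or 0 for "no compaction".
size_t PickIntraL0Span(const std::vector<L0File>& newest_first,
                       size_t min_files_to_compact,
                       uint64_t max_compaction_bytes) {
  size_t span_len = 0;
  uint64_t total_bytes = 0;
  double best_bytes_per_del_file = 0.0;
  for (const L0File& f : newest_first) {
    if (f.being_compacted) {
      break;  // the span must be a contiguous run of idle files
    }
    if (total_bytes + f.compensated_size > max_compaction_bytes) {
      break;
    }
    total_bytes += f.compensated_size;
    const double bytes_per_del_file =
        static_cast<double>(total_bytes) / static_cast<double>(span_len + 1);
    if (span_len > 0 && bytes_per_del_file >= best_bytes_per_del_file) {
      break;  // adding this file no longer improves work per deleted file
    }
    best_bytes_per_del_file = bytes_per_del_file;
    ++span_len;
  }
  return span_len >= min_files_to_compact ? span_len : 0;
}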
+ + +#include <limits> +#include <string> +#include <utility> +#include "db/compaction/compaction.h" +#include "db/compaction/compaction_picker_fifo.h" +#include "db/compaction/compaction_picker_level.h" +#include "db/compaction/compaction_picker_universal.h" + +#include "logging/logging.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +class CountingLogger : public Logger { + public: + using Logger::Logv; + void Logv(const char* /*format*/, va_list /*ap*/) override { log_count++; } + size_t log_count; +}; + +class CompactionPickerTest : public testing::Test { + public: + const Comparator* ucmp_; + InternalKeyComparator icmp_; + Options options_; + ImmutableCFOptions ioptions_; + MutableCFOptions mutable_cf_options_; + LevelCompactionPicker level_compaction_picker; + std::string cf_name_; + CountingLogger logger_; + LogBuffer log_buffer_; + uint32_t file_num_; + CompactionOptionsFIFO fifo_options_; + std::unique_ptr<VersionStorageInfo> vstorage_; + std::vector<std::unique_ptr<FileMetaData>> files_; + // does not own FileMetaData + std::unordered_map<uint32_t, std::pair<FileMetaData*, int>> file_map_; + // input files to compaction process. + std::vector<CompactionInputFiles> input_files_; + int compaction_level_start_; + + CompactionPickerTest() + : ucmp_(BytewiseComparator()), + icmp_(ucmp_), + ioptions_(options_), + mutable_cf_options_(options_), + level_compaction_picker(ioptions_, &icmp_), + cf_name_("dummy"), + log_buffer_(InfoLogLevel::INFO_LEVEL, &logger_), + file_num_(1), + vstorage_(nullptr) { + mutable_cf_options_.ttl = 0; + mutable_cf_options_.periodic_compaction_seconds = 0; + // ioptions_.compaction_pri = kMinOverlappingRatio has its own set of + // tests to cover. + ioptions_.compaction_pri = kByCompensatedSize; + fifo_options_.max_table_files_size = 1; + mutable_cf_options_.RefreshDerivedOptions(ioptions_); + ioptions_.cf_paths.emplace_back("dummy", + std::numeric_limits<uint64_t>::max()); + } + + ~CompactionPickerTest() override {} + + void NewVersionStorage(int num_levels, CompactionStyle style) { + DeleteVersionStorage(); + options_.num_levels = num_levels; + vstorage_.reset(new VersionStorageInfo(&icmp_, ucmp_, options_.num_levels, + style, nullptr, false)); + vstorage_->CalculateBaseBytes(ioptions_, mutable_cf_options_); + } + + void DeleteVersionStorage() { + vstorage_.reset(); + files_.clear(); + file_map_.clear(); + input_files_.clear(); + } + + void Add(int level, uint32_t file_number, const char* smallest, + const char* largest, uint64_t file_size = 1, uint32_t path_id = 0, + SequenceNumber smallest_seq = 100, SequenceNumber largest_seq = 100, + size_t compensated_file_size = 0) { + assert(level < vstorage_->num_levels()); + FileMetaData* f = new FileMetaData( + file_number, path_id, file_size, + InternalKey(smallest, smallest_seq, kTypeValue), + InternalKey(largest, largest_seq, kTypeValue), smallest_seq, + largest_seq, /* marked_for_compact */ false, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName); + f->compensated_file_size = + (compensated_file_size != 0) ? 
compensated_file_size : file_size; + vstorage_->AddFile(level, f); + files_.emplace_back(f); + file_map_.insert({file_number, {f, level}}); + } + + void SetCompactionInputFilesLevels(int level_count, int start_level) { + input_files_.resize(level_count); + for (int i = 0; i < level_count; ++i) { + input_files_[i].level = start_level + i; + } + compaction_level_start_ = start_level; + } + + void AddToCompactionFiles(uint32_t file_number) { + auto iter = file_map_.find(file_number); + assert(iter != file_map_.end()); + int level = iter->second.second; + assert(level < vstorage_->num_levels()); + input_files_[level - compaction_level_start_].files.emplace_back( + iter->second.first); + } + + void UpdateVersionStorageInfo() { + vstorage_->CalculateBaseBytes(ioptions_, mutable_cf_options_); + vstorage_->UpdateFilesByCompactionPri(ioptions_.compaction_pri); + vstorage_->UpdateNumNonEmptyLevels(); + vstorage_->GenerateFileIndexer(); + vstorage_->GenerateLevelFilesBrief(); + vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_); + vstorage_->GenerateLevel0NonOverlapping(); + vstorage_->ComputeFilesMarkedForCompaction(); + vstorage_->SetFinalized(); + } +}; + +TEST_F(CompactionPickerTest, Empty) { + NewVersionStorage(6, kCompactionStyleLevel); + UpdateVersionStorageInfo(); + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() == nullptr); +} + +TEST_F(CompactionPickerTest, Single) { + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + Add(0, 1U, "p", "q"); + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() == nullptr); +} + +TEST_F(CompactionPickerTest, Level0Trigger) { + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + Add(0, 1U, "150", "200"); + Add(0, 2U, "200", "250"); + + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(2U, compaction->num_input_files(0)); + ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, Level1Trigger) { + NewVersionStorage(6, kCompactionStyleLevel); + Add(1, 66U, "150", "200", 1000000000U); + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(66U, compaction->input(0, 0)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, Level1Trigger2) { + mutable_cf_options_.target_file_size_base = 10000000000; + mutable_cf_options_.RefreshDerivedOptions(ioptions_); + NewVersionStorage(6, kCompactionStyleLevel); + Add(1, 66U, "150", "200", 1000000001U); + Add(1, 88U, "201", "300", 1000000000U); + Add(2, 6U, "150", "179", 1000000000U); + Add(2, 7U, "180", "220", 1000000000U); + Add(2, 8U, "221", "300", 1000000000U); + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), 
&log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(2U, compaction->num_input_files(1)); + ASSERT_EQ(66U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(6U, compaction->input(1, 0)->fd.GetNumber()); + ASSERT_EQ(7U, compaction->input(1, 1)->fd.GetNumber()); + ASSERT_EQ(uint64_t{1073741824}, compaction->OutputFilePreallocationSize()); +} + +TEST_F(CompactionPickerTest, LevelMaxScore) { + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.target_file_size_base = 10000000; + mutable_cf_options_.max_bytes_for_level_base = 10 * 1024 * 1024; + mutable_cf_options_.RefreshDerivedOptions(ioptions_); + Add(0, 1U, "150", "200", 1000000U); + // Level 1 score 1.2 + Add(1, 66U, "150", "200", 6000000U); + Add(1, 88U, "201", "300", 6000000U); + // Level 2 score 1.8. File 7 is the largest. Should be picked + Add(2, 6U, "150", "179", 60000000U); + Add(2, 7U, "180", "220", 60000001U); + Add(2, 8U, "221", "300", 60000000U); + // Level 3 score slightly larger than 1 + Add(3, 26U, "150", "170", 260000000U); + Add(3, 27U, "171", "179", 260000000U); + Add(3, 28U, "191", "220", 260000000U); + Add(3, 29U, "221", "300", 260000000U); + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(mutable_cf_options_.target_file_size_base + + mutable_cf_options_.target_file_size_base / 10, + compaction->OutputFilePreallocationSize()); +} + +TEST_F(CompactionPickerTest, NeedsCompactionLevel) { + const int kLevels = 6; + const int kFileCount = 20; + + for (int level = 0; level < kLevels - 1; ++level) { + NewVersionStorage(kLevels, kCompactionStyleLevel); + uint64_t file_size = vstorage_->MaxBytesForLevel(level) * 2 / kFileCount; + for (int file_count = 1; file_count <= kFileCount; ++file_count) { + // start a brand new version in each test. 
+ NewVersionStorage(kLevels, kCompactionStyleLevel); + for (int i = 0; i < file_count; ++i) { + Add(level, i, ToString((i + 100) * 1000).c_str(), + ToString((i + 100) * 1000 + 999).c_str(), + file_size, 0, i * 100, i * 100 + 99); + } + UpdateVersionStorageInfo(); + ASSERT_EQ(vstorage_->CompactionScoreLevel(0), level); + ASSERT_EQ(level_compaction_picker.NeedsCompaction(vstorage_.get()), + vstorage_->CompactionScore(0) >= 1); + // release the version storage + DeleteVersionStorage(); + } + } +} + +TEST_F(CompactionPickerTest, Level0TriggerDynamic) { + int num_levels = ioptions_.num_levels; + ioptions_.level_compaction_dynamic_level_bytes = true; + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + mutable_cf_options_.max_bytes_for_level_base = 200; + mutable_cf_options_.max_bytes_for_level_multiplier = 10; + NewVersionStorage(num_levels, kCompactionStyleLevel); + Add(0, 1U, "150", "200"); + Add(0, 2U, "200", "250"); + + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(2U, compaction->num_input_files(0)); + ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber()); + ASSERT_EQ(1, static_cast<int>(compaction->num_input_levels())); + ASSERT_EQ(num_levels - 1, compaction->output_level()); +} + +TEST_F(CompactionPickerTest, Level0TriggerDynamic2) { + int num_levels = ioptions_.num_levels; + ioptions_.level_compaction_dynamic_level_bytes = true; + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + mutable_cf_options_.max_bytes_for_level_base = 200; + mutable_cf_options_.max_bytes_for_level_multiplier = 10; + NewVersionStorage(num_levels, kCompactionStyleLevel); + Add(0, 1U, "150", "200"); + Add(0, 2U, "200", "250"); + Add(num_levels - 1, 3U, "200", "250", 300U); + + UpdateVersionStorageInfo(); + ASSERT_EQ(vstorage_->base_level(), num_levels - 2); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(2U, compaction->num_input_files(0)); + ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber()); + ASSERT_EQ(1, static_cast<int>(compaction->num_input_levels())); + ASSERT_EQ(num_levels - 2, compaction->output_level()); +} + +TEST_F(CompactionPickerTest, Level0TriggerDynamic3) { + int num_levels = ioptions_.num_levels; + ioptions_.level_compaction_dynamic_level_bytes = true; + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + mutable_cf_options_.max_bytes_for_level_base = 200; + mutable_cf_options_.max_bytes_for_level_multiplier = 10; + NewVersionStorage(num_levels, kCompactionStyleLevel); + Add(0, 1U, "150", "200"); + Add(0, 2U, "200", "250"); + Add(num_levels - 1, 3U, "200", "250", 300U); + Add(num_levels - 1, 4U, "300", "350", 3000U); + + UpdateVersionStorageInfo(); + ASSERT_EQ(vstorage_->base_level(), num_levels - 3); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(2U, compaction->num_input_files(0)); + ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber()); + ASSERT_EQ(1, 
static_cast<int>(compaction->num_input_levels())); + ASSERT_EQ(num_levels - 3, compaction->output_level()); +} + +TEST_F(CompactionPickerTest, Level0TriggerDynamic4) { + int num_levels = ioptions_.num_levels; + ioptions_.level_compaction_dynamic_level_bytes = true; + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + mutable_cf_options_.max_bytes_for_level_base = 200; + mutable_cf_options_.max_bytes_for_level_multiplier = 10; + + NewVersionStorage(num_levels, kCompactionStyleLevel); + Add(0, 1U, "150", "200"); + Add(0, 2U, "200", "250"); + Add(num_levels - 1, 3U, "200", "250", 300U); + Add(num_levels - 1, 4U, "300", "350", 3000U); + Add(num_levels - 3, 5U, "150", "180", 3U); + Add(num_levels - 3, 6U, "181", "300", 3U); + Add(num_levels - 3, 7U, "400", "450", 3U); + + UpdateVersionStorageInfo(); + ASSERT_EQ(vstorage_->base_level(), num_levels - 3); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(2U, compaction->num_input_files(0)); + ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber()); + ASSERT_EQ(2U, compaction->num_input_files(1)); + ASSERT_EQ(num_levels - 3, compaction->level(1)); + ASSERT_EQ(5U, compaction->input(1, 0)->fd.GetNumber()); + ASSERT_EQ(6U, compaction->input(1, 1)->fd.GetNumber()); + ASSERT_EQ(2, static_cast<int>(compaction->num_input_levels())); + ASSERT_EQ(num_levels - 3, compaction->output_level()); +} + +TEST_F(CompactionPickerTest, LevelTriggerDynamic4) { + int num_levels = ioptions_.num_levels; + ioptions_.level_compaction_dynamic_level_bytes = true; + ioptions_.compaction_pri = kMinOverlappingRatio; + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + mutable_cf_options_.max_bytes_for_level_base = 200; + mutable_cf_options_.max_bytes_for_level_multiplier = 10; + NewVersionStorage(num_levels, kCompactionStyleLevel); + Add(0, 1U, "150", "200"); + Add(num_levels - 1, 3U, "200", "250", 300U); + Add(num_levels - 1, 4U, "300", "350", 3000U); + Add(num_levels - 1, 4U, "400", "450", 3U); + Add(num_levels - 2, 5U, "150", "180", 300U); + Add(num_levels - 2, 6U, "181", "350", 500U); + Add(num_levels - 2, 7U, "400", "450", 200U); + + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(5U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(0, compaction->num_input_files(1)); + ASSERT_EQ(1U, compaction->num_input_levels()); + ASSERT_EQ(num_levels - 1, compaction->output_level()); +} + +// Universal and FIFO Compactions are not supported in ROCKSDB_LITE +#ifndef ROCKSDB_LITE +TEST_F(CompactionPickerTest, NeedsCompactionUniversal) { + NewVersionStorage(1, kCompactionStyleUniversal); + UniversalCompactionPicker universal_compaction_picker( + ioptions_, &icmp_); + UpdateVersionStorageInfo(); + // must return false when there's no files. + ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()), + false); + + // verify the trigger given different number of L0 files. 
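+  // In each case, NeedsCompaction() must agree with whether CompactionScore(0)
+  // has reached 1.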
+ for (int i = 1; + i <= mutable_cf_options_.level0_file_num_compaction_trigger * 2; ++i) { + NewVersionStorage(1, kCompactionStyleUniversal); + Add(0, i, ToString((i + 100) * 1000).c_str(), + ToString((i + 100) * 1000 + 999).c_str(), 1000000, 0, i * 100, + i * 100 + 99); + UpdateVersionStorageInfo(); + ASSERT_EQ(level_compaction_picker.NeedsCompaction(vstorage_.get()), + vstorage_->CompactionScore(0) >= 1); + } +} + +TEST_F(CompactionPickerTest, CompactionUniversalIngestBehindReservedLevel) { + const uint64_t kFileSize = 100000; + NewVersionStorage(1, kCompactionStyleUniversal); + ioptions_.allow_ingest_behind = true; + ioptions_.num_levels = 3; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + UpdateVersionStorageInfo(); + // must return false when there's no files. + ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()), + false); + + NewVersionStorage(3, kCompactionStyleUniversal); + + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); + Add(0, 2U, "201", "250", kFileSize, 0, 401, 450); + Add(0, 4U, "260", "300", kFileSize, 0, 260, 300); + Add(1, 5U, "100", "151", kFileSize, 0, 200, 251); + Add(1, 3U, "301", "350", kFileSize, 0, 101, 150); + Add(2, 6U, "120", "200", kFileSize, 0, 20, 100); + + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + + // output level should be the one above the bottom-most + ASSERT_EQ(1, compaction->output_level()); +} +// Tests if the files can be trivially moved in multi level +// universal compaction when allow_trivial_move option is set +// In this test as the input files overlaps, they cannot +// be trivially moved. + +TEST_F(CompactionPickerTest, CannotTrivialMoveUniversal) { + const uint64_t kFileSize = 100000; + + mutable_cf_options_.compaction_options_universal.allow_trivial_move = true; + NewVersionStorage(1, kCompactionStyleUniversal); + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + UpdateVersionStorageInfo(); + // must return false when there's no files. + ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()), + false); + + NewVersionStorage(3, kCompactionStyleUniversal); + + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); + Add(0, 2U, "201", "250", kFileSize, 0, 401, 450); + Add(0, 4U, "260", "300", kFileSize, 0, 260, 300); + Add(1, 5U, "100", "151", kFileSize, 0, 200, 251); + Add(1, 3U, "301", "350", kFileSize, 0, 101, 150); + Add(2, 6U, "120", "200", kFileSize, 0, 20, 100); + + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + + ASSERT_TRUE(!compaction->is_trivial_move()); +} +// Tests if the files can be trivially moved in multi level +// universal compaction when allow_trivial_move option is set +// In this test as the input files doesn't overlaps, they should +// be trivially moved. 
+TEST_F(CompactionPickerTest, AllowsTrivialMoveUniversal) { + const uint64_t kFileSize = 100000; + + mutable_cf_options_.compaction_options_universal.allow_trivial_move = true; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(3, kCompactionStyleUniversal); + + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); + Add(0, 2U, "201", "250", kFileSize, 0, 401, 450); + Add(0, 4U, "260", "300", kFileSize, 0, 260, 300); + Add(1, 5U, "010", "080", kFileSize, 0, 200, 251); + Add(2, 3U, "301", "350", kFileSize, 0, 101, 150); + + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + + ASSERT_TRUE(compaction->is_trivial_move()); +} + +TEST_F(CompactionPickerTest, UniversalPeriodicCompaction1) { + // The case where universal periodic compaction can be picked + // with some newer files being compacted. + const uint64_t kFileSize = 100000; + + mutable_cf_options_.periodic_compaction_seconds = 1000; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(5, kCompactionStyleUniversal); + + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); + Add(0, 2U, "201", "250", kFileSize, 0, 401, 450); + Add(0, 4U, "260", "300", kFileSize, 0, 260, 300); + Add(3, 5U, "010", "080", kFileSize, 0, 200, 251); + Add(4, 3U, "301", "350", kFileSize, 0, 101, 150); + Add(4, 6U, "501", "750", kFileSize, 0, 101, 150); + + file_map_[2].first->being_compacted = true; + UpdateVersionStorageInfo(); + vstorage_->TEST_AddFileMarkedForPeriodicCompaction(4, file_map_[3].first); + + std::unique_ptr<Compaction> compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + + ASSERT_TRUE(compaction); + ASSERT_EQ(4, compaction->output_level()); + ASSERT_EQ(0, compaction->start_level()); + ASSERT_EQ(1U, compaction->num_input_files(0)); +} + +TEST_F(CompactionPickerTest, UniversalPeriodicCompaction2) { + // The case where universal periodic compaction does not + // pick up only level to compact if it doesn't cover + // any file marked as periodic compaction. + const uint64_t kFileSize = 100000; + + mutable_cf_options_.periodic_compaction_seconds = 1000; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(5, kCompactionStyleUniversal); + + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); + Add(3, 5U, "010", "080", kFileSize, 0, 200, 251); + Add(4, 3U, "301", "350", kFileSize, 0, 101, 150); + Add(4, 6U, "501", "750", kFileSize, 0, 101, 150); + + file_map_[5].first->being_compacted = true; + UpdateVersionStorageInfo(); + vstorage_->TEST_AddFileMarkedForPeriodicCompaction(0, file_map_[1].first); + + std::unique_ptr<Compaction> compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + + ASSERT_FALSE(compaction); +} + +TEST_F(CompactionPickerTest, UniversalPeriodicCompaction3) { + // The case where universal periodic compaction does not + // pick up only the last sorted run which is an L0 file if it isn't + // marked as periodic compaction. 
+ const uint64_t kFileSize = 100000; + + mutable_cf_options_.periodic_compaction_seconds = 1000; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(5, kCompactionStyleUniversal); + + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); + Add(0, 5U, "010", "080", kFileSize, 0, 200, 251); + Add(0, 6U, "501", "750", kFileSize, 0, 101, 150); + + file_map_[5].first->being_compacted = true; + UpdateVersionStorageInfo(); + vstorage_->TEST_AddFileMarkedForPeriodicCompaction(0, file_map_[1].first); + + std::unique_ptr<Compaction> compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + + ASSERT_FALSE(compaction); +} + +TEST_F(CompactionPickerTest, UniversalPeriodicCompaction4) { + // The case where universal periodic compaction couldn't form + // a compaction that inlcudes any file marked for periodic compaction. + // Right now we form the compaction anyway if it is more than one + // sorted run. Just put the case here to validate that it doesn't + // crash. + const uint64_t kFileSize = 100000; + + mutable_cf_options_.periodic_compaction_seconds = 1000; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(5, kCompactionStyleUniversal); + + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); + Add(2, 2U, "010", "080", kFileSize, 0, 200, 251); + Add(3, 5U, "010", "080", kFileSize, 0, 200, 251); + Add(4, 3U, "301", "350", kFileSize, 0, 101, 150); + Add(4, 6U, "501", "750", kFileSize, 0, 101, 150); + + file_map_[2].first->being_compacted = true; + UpdateVersionStorageInfo(); + vstorage_->TEST_AddFileMarkedForPeriodicCompaction(0, file_map_[2].first); + + std::unique_ptr<Compaction> compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(!compaction || + compaction->start_level() != compaction->output_level()); +} + +TEST_F(CompactionPickerTest, UniversalPeriodicCompaction5) { + // Test single L0 file periodic compaction triggering. 
+ const uint64_t kFileSize = 100000; + + mutable_cf_options_.periodic_compaction_seconds = 1000; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(5, kCompactionStyleUniversal); + + Add(0, 6U, "150", "200", kFileSize, 0, 500, 550); + UpdateVersionStorageInfo(); + vstorage_->TEST_AddFileMarkedForPeriodicCompaction(0, file_map_[6].first); + + std::unique_ptr<Compaction> compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction); + ASSERT_EQ(0, compaction->start_level()); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(6U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(4, compaction->output_level()); +} + +TEST_F(CompactionPickerTest, UniversalPeriodicCompaction6) { + // Test single sorted run non-L0 periodic compaction + const uint64_t kFileSize = 100000; + + mutable_cf_options_.periodic_compaction_seconds = 1000; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(5, kCompactionStyleUniversal); + + Add(4, 5U, "150", "200", kFileSize, 0, 500, 550); + Add(4, 6U, "350", "400", kFileSize, 0, 500, 550); + UpdateVersionStorageInfo(); + vstorage_->TEST_AddFileMarkedForPeriodicCompaction(4, file_map_[6].first); + + std::unique_ptr<Compaction> compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction); + ASSERT_EQ(4, compaction->start_level()); + ASSERT_EQ(2U, compaction->num_input_files(0)); + ASSERT_EQ(5U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(6U, compaction->input(0, 1)->fd.GetNumber()); + ASSERT_EQ(4, compaction->output_level()); +} + +TEST_F(CompactionPickerTest, NeedsCompactionFIFO) { + NewVersionStorage(1, kCompactionStyleFIFO); + const int kFileCount = + mutable_cf_options_.level0_file_num_compaction_trigger * 3; + const uint64_t kFileSize = 100000; + const uint64_t kMaxSize = kFileSize * kFileCount / 2; + + fifo_options_.max_table_files_size = kMaxSize; + mutable_cf_options_.compaction_options_fifo = fifo_options_; + FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_); + UpdateVersionStorageInfo(); + // must return false when there's no files. + ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), false); + + // verify whether compaction is needed based on the current + // size of L0 files. 
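+  // For FIFO, the compaction score is the ratio of the total L0 size to
+  // compaction_options_fifo.max_table_files_size (kMaxSize here), so compaction
+  // becomes needed once the cumulative file size exceeds kMaxSize.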
+  uint64_t current_size = 0;
+  for (int i = 1; i <= kFileCount; ++i) {
+    NewVersionStorage(1, kCompactionStyleFIFO);
+    Add(0, i, ToString((i + 100) * 1000).c_str(),
+        ToString((i + 100) * 1000 + 999).c_str(),
+        kFileSize, 0, i * 100, i * 100 + 99);
+    current_size += kFileSize;
+    UpdateVersionStorageInfo();
+    ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()),
+              vstorage_->CompactionScore(0) >= 1);
+  }
+}
+#endif  // ROCKSDB_LITE
+
+TEST_F(CompactionPickerTest, CompactionPriMinOverlapping1) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  ioptions_.compaction_pri = kMinOverlappingRatio;
+  mutable_cf_options_.target_file_size_base = 100000000000;
+  mutable_cf_options_.target_file_size_multiplier = 10;
+  mutable_cf_options_.max_bytes_for_level_base = 10 * 1024 * 1024;
+  mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+
+  Add(2, 6U, "150", "179", 50000000U);
+  Add(2, 7U, "180", "220", 50000000U);
+  Add(2, 8U, "321", "400", 50000000U);  // File not overlapping
+  Add(2, 9U, "721", "800", 50000000U);
+
+  Add(3, 26U, "150", "170", 260000000U);
+  Add(3, 27U, "171", "179", 260000000U);
+  Add(3, 28U, "191", "220", 260000000U);
+  Add(3, 29U, "221", "300", 260000000U);
+  Add(3, 30U, "750", "900", 260000000U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  // Pick file 8 because it overlaps with 0 files on level 3.
+  ASSERT_EQ(8U, compaction->input(0, 0)->fd.GetNumber());
+  // Compaction input size * 1.1
+  ASSERT_GE(uint64_t{55000000}, compaction->OutputFilePreallocationSize());
+}
+
+TEST_F(CompactionPickerTest, CompactionPriMinOverlapping2) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  ioptions_.compaction_pri = kMinOverlappingRatio;
+  mutable_cf_options_.target_file_size_base = 10000000;
+  mutable_cf_options_.target_file_size_multiplier = 10;
+  mutable_cf_options_.max_bytes_for_level_base = 10 * 1024 * 1024;
+
+  Add(2, 6U, "150", "175",
+      60000000U);  // Overlaps with file 26, 27, total size 521M
+  Add(2, 7U, "176", "200", 60000000U);  // Overlaps with file 27, 28, total size
+                                        // 520M, the smallest overlapping
+  Add(2, 8U, "201", "300",
+      60000000U);  // Overlaps with file 28, 29, total size 521M
+
+  Add(3, 26U, "100", "110", 261000000U);
+  Add(3, 26U, "150", "170", 261000000U);
+  Add(3, 27U, "171", "179", 260000000U);
+  Add(3, 28U, "191", "220", 260000000U);
+  Add(3, 29U, "221", "300", 261000000U);
+  Add(3, 30U, "321", "400", 261000000U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  // Picking file 7 because its overlapping ratio is the smallest.
+  ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, CompactionPriMinOverlapping3) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  ioptions_.compaction_pri = kMinOverlappingRatio;
+  mutable_cf_options_.max_bytes_for_level_base = 10000000;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+
+  // Files 7 and 8 each overlap a single file in the output level, but file 8
+  // is slightly larger, so its overlapping ratio is smaller and it will be
+  // picked.
+  Add(2, 6U, "150", "167", 60000000U);  // Overlaps with file 26, 27
+  Add(2, 7U, "168", "169", 60000000U);  // Overlaps with file 27
+  Add(2, 8U, "201", "300", 61000000U);  // Overlaps with file 28, but the file
+                                        // itself is larger. Should be picked.
+
+  Add(3, 26U, "160", "165", 260000000U);
+  Add(3, 27U, "166", "170", 260000000U);
+  Add(3, 28U, "180", "400", 260000000U);
+  Add(3, 29U, "401", "500", 260000000U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  // Picking file 8 because its overlapping ratio is the smallest.
+  ASSERT_EQ(8U, compaction->input(0, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, CompactionPriMinOverlapping4) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  ioptions_.compaction_pri = kMinOverlappingRatio;
+  mutable_cf_options_.max_bytes_for_level_base = 10000000;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+
+  // File 6 overlaps with files 26 and 27, and its compensated file size is
+  // boosted, so its overlapping ratio is the smallest and it will be picked.
+  Add(2, 6U, "150", "167", 60000000U, 0, 100, 100, 180000000U);
+  Add(2, 7U, "168", "169", 60000000U);  // Overlaps with file 27
+  Add(2, 8U, "201", "300", 61000000U);  // Overlaps with file 28
+
+  Add(3, 26U, "160", "165", 60000000U);
+  // Boosted file size in output level is not considered.
+  Add(3, 27U, "166", "170", 60000000U, 0, 100, 100, 260000000U);
+  Add(3, 28U, "180", "400", 60000000U);
+  Add(3, 29U, "401", "500", 60000000U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  // Picking file 6 because its compensated size makes its overlapping ratio
+  // the smallest.
+  ASSERT_EQ(6U, compaction->input(0, 0)->fd.GetNumber());
+}
+
+// This test exhibits the bug where we don't properly reset parent_index in
+// PickCompaction()
+TEST_F(CompactionPickerTest, ParentIndexResetBug) {
+  int num_levels = ioptions_.num_levels;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  mutable_cf_options_.max_bytes_for_level_base = 200;
+  NewVersionStorage(num_levels, kCompactionStyleLevel);
+  Add(0, 1U, "150", "200");       // <- marked for compaction
+  Add(1, 3U, "400", "500", 600);  // <- this one needs compacting
+  Add(2, 4U, "150", "200");
+  Add(2, 5U, "201", "210");
+  Add(2, 6U, "300", "310");
+  Add(2, 7U, "400", "500");  // <- being compacted
+
+  vstorage_->LevelFiles(2)[3]->being_compacted = true;
+  vstorage_->LevelFiles(0)[0]->marked_for_compaction = true;
+
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+}
+
+// This test checks ExpandWhileOverlapping() by having overlapping user key
+// ranges (with different sequence numbers) in the input files.
+TEST_F(CompactionPickerTest, OverlappingUserKeys) { + NewVersionStorage(6, kCompactionStyleLevel); + ioptions_.compaction_pri = kByCompensatedSize; + + Add(1, 1U, "100", "150", 1U); + // Overlapping user keys + Add(1, 2U, "200", "400", 1U); + Add(1, 3U, "400", "500", 1000000000U, 0, 0); + Add(2, 4U, "600", "700", 1U); + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_levels()); + ASSERT_EQ(2U, compaction->num_input_files(0)); + ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(3U, compaction->input(0, 1)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, OverlappingUserKeys2) { + NewVersionStorage(6, kCompactionStyleLevel); + // Overlapping user keys on same level and output level + Add(1, 1U, "200", "400", 1000000000U); + Add(1, 2U, "400", "500", 1U, 0, 0); + Add(2, 3U, "000", "100", 1U); + Add(2, 4U, "100", "600", 1U, 0, 0); + Add(2, 5U, "600", "700", 1U, 0, 0); + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(2U, compaction->num_input_levels()); + ASSERT_EQ(2U, compaction->num_input_files(0)); + ASSERT_EQ(3U, compaction->num_input_files(1)); + ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber()); + ASSERT_EQ(3U, compaction->input(1, 0)->fd.GetNumber()); + ASSERT_EQ(4U, compaction->input(1, 1)->fd.GetNumber()); + ASSERT_EQ(5U, compaction->input(1, 2)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, OverlappingUserKeys3) { + NewVersionStorage(6, kCompactionStyleLevel); + // Chain of overlapping user key ranges (forces ExpandWhileOverlapping() to + // expand multiple times) + Add(1, 1U, "100", "150", 1U); + Add(1, 2U, "150", "200", 1U, 0, 0); + Add(1, 3U, "200", "250", 1000000000U, 0, 0); + Add(1, 4U, "250", "300", 1U, 0, 0); + Add(1, 5U, "300", "350", 1U, 0, 0); + // Output level overlaps with the beginning and the end of the chain + Add(2, 6U, "050", "100", 1U); + Add(2, 7U, "350", "400", 1U); + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(2U, compaction->num_input_levels()); + ASSERT_EQ(5U, compaction->num_input_files(0)); + ASSERT_EQ(2U, compaction->num_input_files(1)); + ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber()); + ASSERT_EQ(3U, compaction->input(0, 2)->fd.GetNumber()); + ASSERT_EQ(4U, compaction->input(0, 3)->fd.GetNumber()); + ASSERT_EQ(5U, compaction->input(0, 4)->fd.GetNumber()); + ASSERT_EQ(6U, compaction->input(1, 0)->fd.GetNumber()); + ASSERT_EQ(7U, compaction->input(1, 1)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, OverlappingUserKeys4) { + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.max_bytes_for_level_base = 1000000; + + Add(1, 1U, "100", "150", 1U); + Add(1, 2U, "150", "199", 1U, 0, 0); + Add(1, 3U, "200", "250", 1100000U, 0, 0); + Add(1, 4U, "251", "300", 1U, 0, 0); + Add(1, 5U, "300", "350", 1U, 0, 0); + + Add(2, 6U, "100", "115", 1U); + Add(2, 7U, "125", "325", 1U); + Add(2, 8U, "350", "400", 1U); + 
UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(2U, compaction->num_input_levels()); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(1U, compaction->num_input_files(1)); + ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(7U, compaction->input(1, 0)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, OverlappingUserKeys5) { + NewVersionStorage(6, kCompactionStyleLevel); + // Overlapping user keys on same level and output level + Add(1, 1U, "200", "400", 1000000000U); + Add(1, 2U, "400", "500", 1U, 0, 0); + Add(2, 3U, "000", "100", 1U); + Add(2, 4U, "100", "600", 1U, 0, 0); + Add(2, 5U, "600", "700", 1U, 0, 0); + + vstorage_->LevelFiles(2)[2]->being_compacted = true; + + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() == nullptr); +} + +TEST_F(CompactionPickerTest, OverlappingUserKeys6) { + NewVersionStorage(6, kCompactionStyleLevel); + // Overlapping user keys on same level and output level + Add(1, 1U, "200", "400", 1U, 0, 0); + Add(1, 2U, "401", "500", 1U, 0, 0); + Add(2, 3U, "000", "100", 1U); + Add(2, 4U, "100", "300", 1U, 0, 0); + Add(2, 5U, "305", "450", 1U, 0, 0); + Add(2, 6U, "460", "600", 1U, 0, 0); + Add(2, 7U, "600", "700", 1U, 0, 0); + + vstorage_->LevelFiles(1)[0]->marked_for_compaction = true; + vstorage_->LevelFiles(1)[1]->marked_for_compaction = true; + + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(2U, compaction->num_input_levels()); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(3U, compaction->num_input_files(1)); +} + +TEST_F(CompactionPickerTest, OverlappingUserKeys7) { + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.max_compaction_bytes = 100000000000u; + // Overlapping user keys on same level and output level + Add(1, 1U, "200", "400", 1U, 0, 0); + Add(1, 2U, "401", "500", 1000000000U, 0, 0); + Add(2, 3U, "100", "250", 1U); + Add(2, 4U, "300", "600", 1U, 0, 0); + Add(2, 5U, "600", "800", 1U, 0, 0); + + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(2U, compaction->num_input_levels()); + ASSERT_GE(1U, compaction->num_input_files(0)); + ASSERT_GE(2U, compaction->num_input_files(1)); + // File 5 has to be included in the compaction + ASSERT_EQ(5U, compaction->inputs(1)->back()->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, OverlappingUserKeys8) { + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.max_compaction_bytes = 100000000000u; + // grow the number of inputs in "level" without + // changing the number of "level+1" files we pick up + // Expand input level as much as possible + // no overlapping case + Add(1, 1U, "101", "150", 1U); + Add(1, 2U, "151", "200", 1U); + Add(1, 3U, "201", "300", 1000000000U); + Add(1, 4U, "301", "400", 1U); + Add(1, 5U, "401", "500", 1U); + Add(2, 6U, "150", "200", 1U); + Add(2, 7U, "200", "450", 1U, 0, 0); + Add(2, 8U, "500", "600", 1U); + + 
UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(2U, compaction->num_input_levels()); + ASSERT_EQ(3U, compaction->num_input_files(0)); + ASSERT_EQ(2U, compaction->num_input_files(1)); + ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(3U, compaction->input(0, 1)->fd.GetNumber()); + ASSERT_EQ(4U, compaction->input(0, 2)->fd.GetNumber()); + ASSERT_EQ(6U, compaction->input(1, 0)->fd.GetNumber()); + ASSERT_EQ(7U, compaction->input(1, 1)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, OverlappingUserKeys9) { + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.max_compaction_bytes = 100000000000u; + // grow the number of inputs in "level" without + // changing the number of "level+1" files we pick up + // Expand input level as much as possible + // overlapping case + Add(1, 1U, "121", "150", 1U); + Add(1, 2U, "151", "200", 1U); + Add(1, 3U, "201", "300", 1000000000U); + Add(1, 4U, "301", "400", 1U); + Add(1, 5U, "401", "500", 1U); + Add(2, 6U, "100", "120", 1U); + Add(2, 7U, "150", "200", 1U); + Add(2, 8U, "200", "450", 1U, 0, 0); + Add(2, 9U, "501", "600", 1U); + + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(2U, compaction->num_input_levels()); + ASSERT_EQ(5U, compaction->num_input_files(0)); + ASSERT_EQ(2U, compaction->num_input_files(1)); + ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber()); + ASSERT_EQ(3U, compaction->input(0, 2)->fd.GetNumber()); + ASSERT_EQ(4U, compaction->input(0, 3)->fd.GetNumber()); + ASSERT_EQ(7U, compaction->input(1, 0)->fd.GetNumber()); + ASSERT_EQ(8U, compaction->input(1, 1)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, OverlappingUserKeys10) { + // Locked file encountered when pulling in extra input-level files with same + // user keys. Verify we pick the next-best file from the same input level. + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.max_compaction_bytes = 100000000000u; + + // file_number 2U is largest and thus first choice. But it overlaps with + // file_number 1U which is being compacted. So instead we pick the next- + // biggest file, 3U, which is eligible for compaction. 
+ Add(1 /* level */, 1U /* file_number */, "100" /* smallest */, + "150" /* largest */, 1U /* file_size */); + file_map_[1U].first->being_compacted = true; + Add(1 /* level */, 2U /* file_number */, "150" /* smallest */, + "200" /* largest */, 1000000000U /* file_size */, 0 /* smallest_seq */, + 0 /* largest_seq */); + Add(1 /* level */, 3U /* file_number */, "201" /* smallest */, + "250" /* largest */, 900000000U /* file_size */); + Add(2 /* level */, 4U /* file_number */, "100" /* smallest */, + "150" /* largest */, 1U /* file_size */); + Add(2 /* level */, 5U /* file_number */, "151" /* smallest */, + "200" /* largest */, 1U /* file_size */); + Add(2 /* level */, 6U /* file_number */, "201" /* smallest */, + "250" /* largest */, 1U /* file_size */); + + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(2U, compaction->num_input_levels()); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(1U, compaction->num_input_files(1)); + ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(6U, compaction->input(1, 0)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, OverlappingUserKeys11) { + // Locked file encountered when pulling in extra output-level files with same + // user keys. Expected to skip that compaction and pick the next-best choice. + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.max_compaction_bytes = 100000000000u; + + // score(L1) = 3.7 + // score(L2) = 1.85 + // There is no eligible file in L1 to compact since both candidates pull in + // file_number 5U, which overlaps with a file pending compaction (6U). The + // first eligible compaction is from L2->L3. + Add(1 /* level */, 2U /* file_number */, "151" /* smallest */, + "200" /* largest */, 1000000000U /* file_size */); + Add(1 /* level */, 3U /* file_number */, "201" /* smallest */, + "250" /* largest */, 1U /* file_size */); + Add(2 /* level */, 4U /* file_number */, "100" /* smallest */, + "149" /* largest */, 5000000000U /* file_size */); + Add(2 /* level */, 5U /* file_number */, "150" /* smallest */, + "201" /* largest */, 1U /* file_size */); + Add(2 /* level */, 6U /* file_number */, "201" /* smallest */, + "249" /* largest */, 1U /* file_size */, 0 /* smallest_seq */, + 0 /* largest_seq */); + file_map_[6U].first->being_compacted = true; + Add(3 /* level */, 7U /* file_number */, "100" /* smallest */, + "149" /* largest */, 1U /* file_size */); + + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(2U, compaction->num_input_levels()); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(1U, compaction->num_input_files(1)); + ASSERT_EQ(4U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(7U, compaction->input(1, 0)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri1) { + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + mutable_cf_options_.max_bytes_for_level_base = 900000000U; + + // 6 L0 files, score 3. 
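+  // (The L0 score is the file count relative to the trigger: 6 files with a
+  // trigger of 2 gives a score of 3.)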
+  Add(0, 1U, "000", "400", 1U);
+  Add(0, 2U, "001", "400", 1U, 0, 0);
+  Add(0, 3U, "001", "400", 1000000000U, 0, 0);
+  Add(0, 31U, "001", "400", 1000000000U, 0, 0);
+  Add(0, 32U, "001", "400", 1000000000U, 0, 0);
+  Add(0, 33U, "001", "400", 1000000000U, 0, 0);
+
+  // L1 total size 2GB, score 2.2. If one file is being compacted, score 1.1.
+  Add(1, 4U, "050", "300", 1000000000U, 0, 0);
+  file_map_[4u].first->being_compacted = true;
+  Add(1, 5U, "301", "350", 1000000000U, 0, 0);
+
+  // Output level overlaps with the beginning and the end of the chain
+  Add(2, 6U, "050", "100", 1U);
+  Add(2, 7U, "300", "400", 1U);
+
+  // No compaction should be scheduled if L0 has higher priority than L1
+  // but L0->L1 compaction is blocked by a file in L1 being compacted.
+  UpdateVersionStorageInfo();
+  ASSERT_EQ(0, vstorage_->CompactionScoreLevel(0));
+  ASSERT_EQ(1, vstorage_->CompactionScoreLevel(1));
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+  ASSERT_TRUE(compaction.get() == nullptr);
+}
+
+TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri2) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  mutable_cf_options_.max_bytes_for_level_base = 900000000U;
+
+  // 6 L0 files, score 3.
+  Add(0, 1U, "000", "400", 1U);
+  Add(0, 2U, "001", "400", 1U, 0, 0);
+  Add(0, 3U, "001", "400", 1000000000U, 0, 0);
+  Add(0, 31U, "001", "400", 1000000000U, 0, 0);
+  Add(0, 32U, "001", "400", 1000000000U, 0, 0);
+  Add(0, 33U, "001", "400", 1000000000U, 0, 0);
+
+  // L1 total size 2GB, score 2.2. If one file is being compacted, score 1.1.
+  Add(1, 4U, "050", "300", 1000000000U, 0, 0);
+  Add(1, 5U, "301", "350", 1000000000U, 0, 0);
+
+  // Output level overlaps with the beginning and the end of the chain
+  Add(2, 6U, "050", "100", 1U);
+  Add(2, 7U, "300", "400", 1U);
+
+  // If no file in L1 is being compacted, L0->L1 compaction will be scheduled.
+  UpdateVersionStorageInfo();  // being_compacted flag is cleared here.
+  ASSERT_EQ(0, vstorage_->CompactionScoreLevel(0));
+  ASSERT_EQ(1, vstorage_->CompactionScoreLevel(1));
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+}
+
+TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri3) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  mutable_cf_options_.max_bytes_for_level_base = 900000000U;
+
+  // 6 L0 files, score 3.
+  Add(0, 1U, "000", "400", 1U);
+  Add(0, 2U, "001", "400", 1U, 0, 0);
+  Add(0, 3U, "001", "400", 1000000000U, 0, 0);
+  Add(0, 31U, "001", "400", 1000000000U, 0, 0);
+  Add(0, 32U, "001", "400", 1000000000U, 0, 0);
+  Add(0, 33U, "001", "400", 1000000000U, 0, 0);
+
+  // L1 score more than 6.
+  Add(1, 4U, "050", "300", 1000000000U, 0, 0);
+  file_map_[4u].first->being_compacted = true;
+  Add(1, 5U, "301", "350", 1000000000U, 0, 0);
+  Add(1, 51U, "351", "400", 6000000000U, 0, 0);
+
+  // Output level overlaps with the beginning and the end of the chain
+  Add(2, 6U, "050", "100", 1U);
+  Add(2, 7U, "300", "400", 1U);
+
+  // If the score of L1 is larger than that of L0, L1 compaction goes through
+  // even though there is a pending L0 compaction.
+  UpdateVersionStorageInfo();
+  ASSERT_EQ(1, vstorage_->CompactionScoreLevel(0));
+  ASSERT_EQ(0, vstorage_->CompactionScoreLevel(1));
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+}
+
+TEST_F(CompactionPickerTest, EstimateCompactionBytesNeeded1) {
+  int num_levels = ioptions_.num_levels;
+  ioptions_.level_compaction_dynamic_level_bytes = false;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 4;
+  mutable_cf_options_.max_bytes_for_level_base = 1000;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+  NewVersionStorage(num_levels, kCompactionStyleLevel);
+  Add(0, 1U, "150", "200", 200);
+  Add(0, 2U, "150", "200", 200);
+  Add(0, 3U, "150", "200", 200);
+  // Level 1 is over target by 200
+  Add(1, 4U, "400", "500", 600);
+  Add(1, 5U, "600", "700", 600);
+  // Level 2 is less than its target of 10000 even after adding the size of
+  // level 1
+  // Size ratio of L2/L1 is 9600 / 1200 = 8
+  Add(2, 6U, "150", "200", 2500);
+  Add(2, 7U, "201", "210", 2000);
+  Add(2, 8U, "300", "310", 2600);
+  Add(2, 9U, "400", "500", 2500);
+  // Level 3 exceeds its target of 100,000 by 1,000
+  Add(3, 10U, "400", "500", 101000);
+  // Level 4 exceeds target 1,000,000 by 900 after adding size from level 3
+  // Size ratio L4/L3 is 9.9
+  // After merge from L3, L4 size is 1000900
+  Add(4, 11U, "400", "500", 999900);
+  Add(5, 11U, "400", "500", 8007200);
+
+  UpdateVersionStorageInfo();
+
+  ASSERT_EQ(200u * 9u + 10900u + 900u * 9,
+            vstorage_->estimated_compaction_needed_bytes());
+}
+
+TEST_F(CompactionPickerTest, EstimateCompactionBytesNeeded2) {
+  int num_levels = ioptions_.num_levels;
+  ioptions_.level_compaction_dynamic_level_bytes = false;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 3;
+  mutable_cf_options_.max_bytes_for_level_base = 1000;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+  NewVersionStorage(num_levels, kCompactionStyleLevel);
+  Add(0, 1U, "150", "200", 200);
+  Add(0, 2U, "150", "200", 200);
+  Add(0, 4U, "150", "200", 200);
+  Add(0, 5U, "150", "200", 200);
+  Add(0, 6U, "150", "200", 200);
+  // Level 1 size will be 1400 after merging with L0
+  Add(1, 7U, "400", "500", 200);
+  Add(1, 8U, "600", "700", 200);
+  // Level 2 is less than its target of 10000 even after adding the size of
+  // level 1
+  Add(2, 9U, "150", "200", 9100);
+  // Level 3 is over its target, but since level 4 is empty, we assume it will
+  // be a trivial move.
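+  // A trivial move rewrites no data, so it contributes nothing to the estimate.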
+ Add(3, 10U, "400", "500", 101000); + + UpdateVersionStorageInfo(); + + // estimated L1->L2 merge: 400 * (9100.0 / 1400.0 + 1.0) + ASSERT_EQ(1400u + 3000u, vstorage_->estimated_compaction_needed_bytes()); +} + +TEST_F(CompactionPickerTest, EstimateCompactionBytesNeeded3) { + int num_levels = ioptions_.num_levels; + ioptions_.level_compaction_dynamic_level_bytes = false; + mutable_cf_options_.level0_file_num_compaction_trigger = 3; + mutable_cf_options_.max_bytes_for_level_base = 1000; + mutable_cf_options_.max_bytes_for_level_multiplier = 10; + NewVersionStorage(num_levels, kCompactionStyleLevel); + Add(0, 1U, "150", "200", 2000); + Add(0, 2U, "150", "200", 2000); + Add(0, 4U, "150", "200", 2000); + Add(0, 5U, "150", "200", 2000); + Add(0, 6U, "150", "200", 1000); + // Level 1 size will be 10000 after merging with L0 + Add(1, 7U, "400", "500", 500); + Add(1, 8U, "600", "700", 500); + + Add(2, 9U, "150", "200", 10000); + + UpdateVersionStorageInfo(); + + ASSERT_EQ(10000u + 18000u, vstorage_->estimated_compaction_needed_bytes()); +} + +TEST_F(CompactionPickerTest, EstimateCompactionBytesNeededDynamicLevel) { + int num_levels = ioptions_.num_levels; + ioptions_.level_compaction_dynamic_level_bytes = true; + mutable_cf_options_.level0_file_num_compaction_trigger = 3; + mutable_cf_options_.max_bytes_for_level_base = 1000; + mutable_cf_options_.max_bytes_for_level_multiplier = 10; + NewVersionStorage(num_levels, kCompactionStyleLevel); + + // Set Last level size 50000 + // num_levels - 1 target 5000 + // num_levels - 2 is base level with target 1000 (rounded up to + // max_bytes_for_level_base). + Add(num_levels - 1, 10U, "400", "500", 50000); + + Add(0, 1U, "150", "200", 200); + Add(0, 2U, "150", "200", 200); + Add(0, 4U, "150", "200", 200); + Add(0, 5U, "150", "200", 200); + Add(0, 6U, "150", "200", 200); + // num_levels - 3 is over target by 100 + 1000 + Add(num_levels - 3, 7U, "400", "500", 550); + Add(num_levels - 3, 8U, "600", "700", 550); + // num_levels - 2 is over target by 1100 + 200 + Add(num_levels - 2, 9U, "150", "200", 5200); + + UpdateVersionStorageInfo(); + + // Merging to the second last level: (5200 / 2100 + 1) * 1100 + // Merging to the last level: (50000 / 6300 + 1) * 1300 + ASSERT_EQ(2100u + 3823u + 11617u, + vstorage_->estimated_compaction_needed_bytes()); +} + +TEST_F(CompactionPickerTest, IsBottommostLevelTest) { + // case 1: Higher levels are empty + NewVersionStorage(6, kCompactionStyleLevel); + Add(0, 1U, "a", "m"); + Add(0, 2U, "c", "z"); + Add(1, 3U, "d", "e"); + Add(1, 4U, "l", "p"); + Add(2, 5U, "g", "i"); + Add(2, 6U, "x", "z"); + UpdateVersionStorageInfo(); + SetCompactionInputFilesLevels(2, 1); + AddToCompactionFiles(3U); + AddToCompactionFiles(5U); + bool result = + Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_); + ASSERT_TRUE(result); + + // case 2: Higher levels have no overlap + NewVersionStorage(6, kCompactionStyleLevel); + Add(0, 1U, "a", "m"); + Add(0, 2U, "c", "z"); + Add(1, 3U, "d", "e"); + Add(1, 4U, "l", "p"); + Add(2, 5U, "g", "i"); + Add(2, 6U, "x", "z"); + Add(3, 7U, "k", "p"); + Add(3, 8U, "t", "w"); + Add(4, 9U, "a", "b"); + Add(5, 10U, "c", "cc"); + UpdateVersionStorageInfo(); + SetCompactionInputFilesLevels(2, 1); + AddToCompactionFiles(3U); + AddToCompactionFiles(5U); + result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_); + ASSERT_TRUE(result); + + // case 3.1: Higher levels (level 3) have overlap + NewVersionStorage(6, kCompactionStyleLevel); + Add(0, 1U, "a", "m"); + Add(0, 2U, "c", "z"); + 
Add(1, 3U, "d", "e"); + Add(1, 4U, "l", "p"); + Add(2, 5U, "g", "i"); + Add(2, 6U, "x", "z"); + Add(3, 7U, "e", "g"); + Add(3, 8U, "h", "k"); + Add(4, 9U, "a", "b"); + Add(5, 10U, "c", "cc"); + UpdateVersionStorageInfo(); + SetCompactionInputFilesLevels(2, 1); + AddToCompactionFiles(3U); + AddToCompactionFiles(5U); + result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_); + ASSERT_FALSE(result); + + // case 3.2: Higher levels (level 5) have overlap + DeleteVersionStorage(); + NewVersionStorage(6, kCompactionStyleLevel); + Add(0, 1U, "a", "m"); + Add(0, 2U, "c", "z"); + Add(1, 3U, "d", "e"); + Add(1, 4U, "l", "p"); + Add(2, 5U, "g", "i"); + Add(2, 6U, "x", "z"); + Add(3, 7U, "j", "k"); + Add(3, 8U, "l", "m"); + Add(4, 9U, "a", "b"); + Add(5, 10U, "c", "cc"); + Add(5, 11U, "h", "k"); + Add(5, 12U, "y", "yy"); + Add(5, 13U, "z", "zz"); + UpdateVersionStorageInfo(); + SetCompactionInputFilesLevels(2, 1); + AddToCompactionFiles(3U); + AddToCompactionFiles(5U); + result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_); + ASSERT_FALSE(result); + + // case 3.3: Higher levels (level 5) have overlap, but it's only overlapping + // one key ("d") + NewVersionStorage(6, kCompactionStyleLevel); + Add(0, 1U, "a", "m"); + Add(0, 2U, "c", "z"); + Add(1, 3U, "d", "e"); + Add(1, 4U, "l", "p"); + Add(2, 5U, "g", "i"); + Add(2, 6U, "x", "z"); + Add(3, 7U, "j", "k"); + Add(3, 8U, "l", "m"); + Add(4, 9U, "a", "b"); + Add(5, 10U, "c", "cc"); + Add(5, 11U, "ccc", "d"); + Add(5, 12U, "y", "yy"); + Add(5, 13U, "z", "zz"); + UpdateVersionStorageInfo(); + SetCompactionInputFilesLevels(2, 1); + AddToCompactionFiles(3U); + AddToCompactionFiles(5U); + result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_); + ASSERT_FALSE(result); + + // Level 0 files overlap + NewVersionStorage(6, kCompactionStyleLevel); + Add(0, 1U, "s", "t"); + Add(0, 2U, "a", "m"); + Add(0, 3U, "b", "z"); + Add(0, 4U, "e", "f"); + Add(5, 10U, "y", "z"); + UpdateVersionStorageInfo(); + SetCompactionInputFilesLevels(1, 0); + AddToCompactionFiles(1U); + AddToCompactionFiles(2U); + AddToCompactionFiles(3U); + AddToCompactionFiles(4U); + result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_); + ASSERT_FALSE(result); + + // Level 0 files don't overlap + NewVersionStorage(6, kCompactionStyleLevel); + Add(0, 1U, "s", "t"); + Add(0, 2U, "a", "m"); + Add(0, 3U, "b", "k"); + Add(0, 4U, "e", "f"); + Add(5, 10U, "y", "z"); + UpdateVersionStorageInfo(); + SetCompactionInputFilesLevels(1, 0); + AddToCompactionFiles(1U); + AddToCompactionFiles(2U); + AddToCompactionFiles(3U); + AddToCompactionFiles(4U); + result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_); + ASSERT_TRUE(result); + + // Level 1 files overlap + NewVersionStorage(6, kCompactionStyleLevel); + Add(0, 1U, "s", "t"); + Add(0, 2U, "a", "m"); + Add(0, 3U, "b", "k"); + Add(0, 4U, "e", "f"); + Add(1, 5U, "a", "m"); + Add(1, 6U, "n", "o"); + Add(1, 7U, "w", "y"); + Add(5, 10U, "y", "z"); + UpdateVersionStorageInfo(); + SetCompactionInputFilesLevels(2, 0); + AddToCompactionFiles(1U); + AddToCompactionFiles(2U); + AddToCompactionFiles(3U); + AddToCompactionFiles(4U); + AddToCompactionFiles(5U); + AddToCompactionFiles(6U); + AddToCompactionFiles(7U); + result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_); + ASSERT_FALSE(result); + + DeleteVersionStorage(); +} + +TEST_F(CompactionPickerTest, MaxCompactionBytesHit) { + mutable_cf_options_.max_bytes_for_level_base = 1000000u; + 
mutable_cf_options_.max_compaction_bytes = 800000u;
+  ioptions_.level_compaction_dynamic_level_bytes = false;
+  NewVersionStorage(6, kCompactionStyleLevel);
+  // A compaction should be triggered and pick files 2 and 5.
+  // It cannot expand because adding files 1 and 3 would make the compaction
+  // size exceed mutable_cf_options_.max_compaction_bytes.
+  Add(1, 1U, "100", "150", 300000U);
+  Add(1, 2U, "151", "200", 300001U, 0, 0);
+  Add(1, 3U, "201", "250", 300000U, 0, 0);
+  Add(1, 4U, "251", "300", 300000U, 0, 0);
+  Add(2, 5U, "100", "256", 1U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_levels());
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  ASSERT_EQ(1U, compaction->num_input_files(1));
+  ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(5U, compaction->input(1, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, MaxCompactionBytesNotHit) {
+  mutable_cf_options_.max_bytes_for_level_base = 800000u;
+  mutable_cf_options_.max_compaction_bytes = 1000000u;
+  ioptions_.level_compaction_dynamic_level_bytes = false;
+  NewVersionStorage(6, kCompactionStyleLevel);
+  // A compaction should be triggered and pick files 2 and 5,
+  // and it expands to include files 1 and 3 too.
+  Add(1, 1U, "100", "150", 300000U);
+  Add(1, 2U, "151", "200", 300001U, 0, 0);
+  Add(1, 3U, "201", "250", 300000U, 0, 0);
+  Add(1, 4U, "251", "300", 300000U, 0, 0);
+  Add(2, 5U, "000", "251", 1U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_levels());
+  ASSERT_EQ(3U, compaction->num_input_files(0));
+  ASSERT_EQ(1U, compaction->num_input_files(1));
+  ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+  ASSERT_EQ(3U, compaction->input(0, 2)->fd.GetNumber());
+  ASSERT_EQ(5U, compaction->input(1, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, IsTrivialMoveOn) {
+  mutable_cf_options_.max_bytes_for_level_base = 10000u;
+  mutable_cf_options_.max_compaction_bytes = 10001u;
+  ioptions_.level_compaction_dynamic_level_bytes = false;
+  NewVersionStorage(6, kCompactionStyleLevel);
+  // A compaction should be triggered and pick file 2
+  Add(1, 1U, "100", "150", 3000U);
+  Add(1, 2U, "151", "200", 3001U);
+  Add(1, 3U, "201", "250", 3000U);
+  Add(1, 4U, "251", "300", 3000U);
+
+  Add(3, 5U, "120", "130", 7000U);
+  Add(3, 6U, "170", "180", 7000U);
+  Add(3, 5U, "220", "230", 7000U);
+  Add(3, 5U, "270", "280", 7000U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_TRUE(compaction->IsTrivialMove());
+}
+
+TEST_F(CompactionPickerTest, IsTrivialMoveOff) {
+  mutable_cf_options_.max_bytes_for_level_base = 1000000u;
+  mutable_cf_options_.max_compaction_bytes = 10000u;
+  ioptions_.level_compaction_dynamic_level_bytes = false;
+  NewVersionStorage(6, kCompactionStyleLevel);
+  // A compaction should be triggered and pick all files from level 1
+  Add(1, 1U, "100", "150", 300000U, 0, 0);
+  Add(1, 2U, "150", "200", 300000U, 0, 0);
+  Add(1, 3U, "200", "250", 300000U, 0,
0); + Add(1, 4U, "250", "300", 300000U, 0, 0); + + Add(3, 5U, "120", "130", 6000U); + Add(3, 6U, "140", "150", 6000U); + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_FALSE(compaction->IsTrivialMove()); +} + +TEST_F(CompactionPickerTest, CacheNextCompactionIndex) { + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.max_compaction_bytes = 100000000000u; + + Add(1 /* level */, 1U /* file_number */, "100" /* smallest */, + "149" /* largest */, 1000000000U /* file_size */); + file_map_[1U].first->being_compacted = true; + Add(1 /* level */, 2U /* file_number */, "150" /* smallest */, + "199" /* largest */, 900000000U /* file_size */); + Add(1 /* level */, 3U /* file_number */, "200" /* smallest */, + "249" /* largest */, 800000000U /* file_size */); + Add(1 /* level */, 4U /* file_number */, "250" /* smallest */, + "299" /* largest */, 700000000U /* file_size */); + Add(2 /* level */, 5U /* file_number */, "150" /* smallest */, + "199" /* largest */, 1U /* file_size */); + file_map_[5U].first->being_compacted = true; + + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_levels()); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(0U, compaction->num_input_files(1)); + ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(2, vstorage_->NextCompactionIndex(1 /* level */)); + + compaction.reset(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_levels()); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(0U, compaction->num_input_files(1)); + ASSERT_EQ(4U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(3, vstorage_->NextCompactionIndex(1 /* level */)); + + compaction.reset(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() == nullptr); + ASSERT_EQ(4, vstorage_->NextCompactionIndex(1 /* level */)); +} + +TEST_F(CompactionPickerTest, IntraL0MaxCompactionBytesNotHit) { + // Intra L0 compaction triggers only if there are at least + // level0_file_num_compaction_trigger + 2 L0 files. + mutable_cf_options_.level0_file_num_compaction_trigger = 3; + mutable_cf_options_.max_compaction_bytes = 1000000u; + NewVersionStorage(6, kCompactionStyleLevel); + + // All 5 L0 files will be picked for intra L0 compaction. The one L1 file + // spans entire L0 key range and is marked as being compacted to avoid + // L0->L1 compaction. 
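+  // The total input is 5 * 200000 = 1000000 bytes, which does not exceed
+  // max_compaction_bytes (1000000), so all five L0 files can be included.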
+ Add(0, 1U, "100", "150", 200000U, 0, 100, 101); + Add(0, 2U, "151", "200", 200000U, 0, 102, 103); + Add(0, 3U, "201", "250", 200000U, 0, 104, 105); + Add(0, 4U, "251", "300", 200000U, 0, 106, 107); + Add(0, 5U, "301", "350", 200000U, 0, 108, 109); + Add(1, 6U, "100", "350", 200000U, 0, 110, 111); + vstorage_->LevelFiles(1)[0]->being_compacted = true; + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_levels()); + ASSERT_EQ(5U, compaction->num_input_files(0)); + ASSERT_EQ(CompactionReason::kLevelL0FilesNum, + compaction->compaction_reason()); + ASSERT_EQ(0, compaction->output_level()); +} + +TEST_F(CompactionPickerTest, IntraL0MaxCompactionBytesHit) { + // Intra L0 compaction triggers only if there are at least + // level0_file_num_compaction_trigger + 2 L0 files. + mutable_cf_options_.level0_file_num_compaction_trigger = 3; + mutable_cf_options_.max_compaction_bytes = 999999u; + NewVersionStorage(6, kCompactionStyleLevel); + + // 4 out of 5 L0 files will be picked for intra L0 compaction due to + // max_compaction_bytes limit (the minimum number of files for triggering + // intra L0 compaction is 4). The one L1 file spans entire L0 key range and + // is marked as being compacted to avoid L0->L1 compaction. + Add(0, 1U, "100", "150", 200000U, 0, 100, 101); + Add(0, 2U, "151", "200", 200000U, 0, 102, 103); + Add(0, 3U, "201", "250", 200000U, 0, 104, 105); + Add(0, 4U, "251", "300", 200000U, 0, 106, 107); + Add(0, 5U, "301", "350", 200000U, 0, 108, 109); + Add(1, 6U, "100", "350", 200000U, 0, 109, 110); + vstorage_->LevelFiles(1)[0]->being_compacted = true; + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_levels()); + ASSERT_EQ(4U, compaction->num_input_files(0)); + ASSERT_EQ(CompactionReason::kLevelL0FilesNum, + compaction->compaction_reason()); + ASSERT_EQ(0, compaction->output_level()); +} + +TEST_F(CompactionPickerTest, IntraL0ForEarliestSeqno) { + // Intra L0 compaction triggers only if there are at least + // level0_file_num_compaction_trigger + 2 L0 files. + mutable_cf_options_.level0_file_num_compaction_trigger = 3; + mutable_cf_options_.max_compaction_bytes = 999999u; + NewVersionStorage(6, kCompactionStyleLevel); + + // 4 out of 6 L0 files will be picked for intra L0 compaction due to + // being_compact limit. And the latest one L0 will be skipped due to earliest + // seqno. The one L1 file spans entire L0 key range and is marked as being + // compacted to avoid L0->L1 compaction. 
+ Add(1, 1U, "100", "350", 200000U, 0, 110, 111); + Add(0, 2U, "301", "350", 1U, 0, 108, 109); + Add(0, 3U, "251", "300", 1U, 0, 106, 107); + Add(0, 4U, "201", "250", 1U, 0, 104, 105); + Add(0, 5U, "151", "200", 1U, 0, 102, 103); + Add(0, 6U, "100", "150", 1U, 0, 100, 101); + Add(0, 7U, "100", "100", 1U, 0, 99, 100); + vstorage_->LevelFiles(0)[5]->being_compacted = true; + vstorage_->LevelFiles(1)[0]->being_compacted = true; + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_, 107)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_levels()); + ASSERT_EQ(4U, compaction->num_input_files(0)); + ASSERT_EQ(CompactionReason::kLevelL0FilesNum, + compaction->compaction_reason()); + ASSERT_EQ(0, compaction->output_level()); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/compaction/compaction_picker_universal.cc b/src/rocksdb/db/compaction/compaction_picker_universal.cc new file mode 100644 index 000000000..d8b63956e --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_picker_universal.cc @@ -0,0 +1,1105 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/compaction/compaction_picker_universal.h" +#ifndef ROCKSDB_LITE + +#include <cinttypes> +#include <limits> +#include <queue> +#include <string> +#include <utility> +#include "db/column_family.h" +#include "file/filename.h" +#include "logging/log_buffer.h" +#include "monitoring/statistics.h" +#include "test_util/sync_point.h" +#include "util/random.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { +namespace { +// A helper class that form universal compactions. The class is used by +// UniversalCompactionPicker::PickCompaction(). +// The usage is to create the class, and get the compaction object by calling +// PickCompaction(). +class UniversalCompactionBuilder { + public: + UniversalCompactionBuilder(const ImmutableCFOptions& ioptions, + const InternalKeyComparator* icmp, + const std::string& cf_name, + const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, + UniversalCompactionPicker* picker, + LogBuffer* log_buffer) + : ioptions_(ioptions), + icmp_(icmp), + cf_name_(cf_name), + mutable_cf_options_(mutable_cf_options), + vstorage_(vstorage), + picker_(picker), + log_buffer_(log_buffer) {} + + // Form and return the compaction object. The caller owns return object. 
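+  // Returns nullptr if no compaction can be formed.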
+ Compaction* PickCompaction(); + + private: + struct SortedRun { + SortedRun(int _level, FileMetaData* _file, uint64_t _size, + uint64_t _compensated_file_size, bool _being_compacted) + : level(_level), + file(_file), + size(_size), + compensated_file_size(_compensated_file_size), + being_compacted(_being_compacted) { + assert(compensated_file_size > 0); + assert(level != 0 || file != nullptr); + } + + void Dump(char* out_buf, size_t out_buf_size, + bool print_path = false) const; + + // sorted_run_count is added into the string to print + void DumpSizeInfo(char* out_buf, size_t out_buf_size, + size_t sorted_run_count) const; + + int level; + // `file` Will be null for level > 0. For level = 0, the sorted run is + // for this file. + FileMetaData* file; + // For level > 0, `size` and `compensated_file_size` are sum of sizes all + // files in the level. `being_compacted` should be the same for all files + // in a non-zero level. Use the value here. + uint64_t size; + uint64_t compensated_file_size; + bool being_compacted; + }; + + // Pick Universal compaction to limit read amplification + Compaction* PickCompactionToReduceSortedRuns( + unsigned int ratio, unsigned int max_number_of_files_to_compact); + + // Pick Universal compaction to limit space amplification. + Compaction* PickCompactionToReduceSizeAmp(); + + Compaction* PickDeleteTriggeredCompaction(); + + // Form a compaction from the sorted run indicated by start_index to the + // oldest sorted run. + // The caller is responsible for making sure that those files are not in + // compaction. + Compaction* PickCompactionToOldest(size_t start_index, + CompactionReason compaction_reason); + + // Try to pick periodic compaction. The caller should only call it + // if there is at least one file marked for periodic compaction. + // null will be returned if no such a compaction can be formed + // because some files are being compacted. + Compaction* PickPeriodicCompaction(); + + // Used in universal compaction when the enabled_trivial_move + // option is set. Checks whether there are any overlapping files + // in the input. Returns true if the input files are non + // overlapping. + bool IsInputFilesNonOverlapping(Compaction* c); + + const ImmutableCFOptions& ioptions_; + const InternalKeyComparator* icmp_; + double score_; + std::vector<SortedRun> sorted_runs_; + const std::string& cf_name_; + const MutableCFOptions& mutable_cf_options_; + VersionStorageInfo* vstorage_; + UniversalCompactionPicker* picker_; + LogBuffer* log_buffer_; + + static std::vector<SortedRun> CalculateSortedRuns( + const VersionStorageInfo& vstorage, const ImmutableCFOptions& ioptions, + const MutableCFOptions& mutable_cf_options); + + // Pick a path ID to place a newly generated file, with its estimated file + // size. + static uint32_t GetPathId(const ImmutableCFOptions& ioptions, + const MutableCFOptions& mutable_cf_options, + uint64_t file_size); +}; + +// Used in universal compaction when trivial move is enabled. +// This structure is used for the construction of min heap +// that contains the file meta data, the level of the file +// and the index of the file in that level + +struct InputFileInfo { + InputFileInfo() : f(nullptr), level(0), index(0) {} + + FileMetaData* f; + size_t level; + size_t index; +}; + +// Used in universal compaction when trivial move is enabled. +// This comparator is used for the construction of min heap +// based on the smallest key of the file. 
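// What the heap buys IsInputFilesNonOverlapping() below is a walk over all
// input files in ascending order of their smallest user key; the inputs are
// non-overlapping exactly when, in that order, every file starts strictly
// after the previous one ends. A minimal standalone model of that sweep over
// plain string key ranges (bytewise ordering assumed; this sketch is not part
// of the RocksDB sources) could look like:
//
//   #include <algorithm>
//   #include <string>
//   #include <utility>
//   #include <vector>
//
//   // Each pair holds a file's [smallest, largest] user key.
//   bool RangesAreNonOverlapping(
//       std::vector<std::pair<std::string, std::string>> files) {
//     // Visit files by ascending smallest key, mirroring the min-heap order.
//     std::sort(files.begin(), files.end());
//     for (size_t i = 1; i < files.size(); ++i) {
//       // Overlap: this file starts at or before the previous one ends.
//       if (files[i].first <= files[i - 1].second) {
//         return false;
//       }
//     }
//     return true;
//   }
//
// The real implementation below keys a std::priority_queue with the
// SmallestKeyHeapComparator defined next, so the per-level file lists can be
// merged lazily instead of being copied and sorted.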
+struct SmallestKeyHeapComparator { + explicit SmallestKeyHeapComparator(const Comparator* ucmp) { ucmp_ = ucmp; } + + bool operator()(InputFileInfo i1, InputFileInfo i2) const { + return (ucmp_->Compare(i1.f->smallest.user_key(), + i2.f->smallest.user_key()) > 0); + } + + private: + const Comparator* ucmp_; +}; + +typedef std::priority_queue<InputFileInfo, std::vector<InputFileInfo>, + SmallestKeyHeapComparator> + SmallestKeyHeap; + +// This function creates the heap that is used to find if the files are +// overlapping during universal compaction when the allow_trivial_move +// is set. +SmallestKeyHeap create_level_heap(Compaction* c, const Comparator* ucmp) { + SmallestKeyHeap smallest_key_priority_q = + SmallestKeyHeap(SmallestKeyHeapComparator(ucmp)); + + InputFileInfo input_file; + + for (size_t l = 0; l < c->num_input_levels(); l++) { + if (c->num_input_files(l) != 0) { + if (l == 0 && c->start_level() == 0) { + for (size_t i = 0; i < c->num_input_files(0); i++) { + input_file.f = c->input(0, i); + input_file.level = 0; + input_file.index = i; + smallest_key_priority_q.push(std::move(input_file)); + } + } else { + input_file.f = c->input(l, 0); + input_file.level = l; + input_file.index = 0; + smallest_key_priority_q.push(std::move(input_file)); + } + } + } + return smallest_key_priority_q; +} + +#ifndef NDEBUG +// smallest_seqno and largest_seqno are set iff. `files` is not empty. +void GetSmallestLargestSeqno(const std::vector<FileMetaData*>& files, + SequenceNumber* smallest_seqno, + SequenceNumber* largest_seqno) { + bool is_first = true; + for (FileMetaData* f : files) { + assert(f->fd.smallest_seqno <= f->fd.largest_seqno); + if (is_first) { + is_first = false; + *smallest_seqno = f->fd.smallest_seqno; + *largest_seqno = f->fd.largest_seqno; + } else { + if (f->fd.smallest_seqno < *smallest_seqno) { + *smallest_seqno = f->fd.smallest_seqno; + } + if (f->fd.largest_seqno > *largest_seqno) { + *largest_seqno = f->fd.largest_seqno; + } + } + } +} +#endif +} // namespace + +// Algorithm that checks to see if there are any overlapping +// files in the input +bool UniversalCompactionBuilder::IsInputFilesNonOverlapping(Compaction* c) { + auto comparator = icmp_->user_comparator(); + int first_iter = 1; + + InputFileInfo prev, curr, next; + + SmallestKeyHeap smallest_key_priority_q = + create_level_heap(c, icmp_->user_comparator()); + + while (!smallest_key_priority_q.empty()) { + curr = smallest_key_priority_q.top(); + smallest_key_priority_q.pop(); + + if (first_iter) { + prev = curr; + first_iter = 0; + } else { + if (comparator->Compare(prev.f->largest.user_key(), + curr.f->smallest.user_key()) >= 0) { + // found overlapping files, return false + return false; + } + assert(comparator->Compare(curr.f->largest.user_key(), + prev.f->largest.user_key()) > 0); + prev = curr; + } + + next.f = nullptr; + + if (c->level(curr.level) != 0 && + curr.index < c->num_input_files(curr.level) - 1) { + next.f = c->input(curr.level, curr.index + 1); + next.level = curr.level; + next.index = curr.index + 1; + } + + if (next.f) { + smallest_key_priority_q.push(std::move(next)); + } + } + return true; +} + +bool UniversalCompactionPicker::NeedsCompaction( + const VersionStorageInfo* vstorage) const { + const int kLevel0 = 0; + if (vstorage->CompactionScore(kLevel0) >= 1) { + return true; + } + if (!vstorage->FilesMarkedForPeriodicCompaction().empty()) { + return true; + } + if (!vstorage->FilesMarkedForCompaction().empty()) { + return true; + } + return false; +} + +Compaction* 
UniversalCompactionPicker::PickCompaction( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, LogBuffer* log_buffer, + SequenceNumber /* earliest_memtable_seqno */) { + UniversalCompactionBuilder builder(ioptions_, icmp_, cf_name, + mutable_cf_options, vstorage, this, + log_buffer); + return builder.PickCompaction(); +} + +void UniversalCompactionBuilder::SortedRun::Dump(char* out_buf, + size_t out_buf_size, + bool print_path) const { + if (level == 0) { + assert(file != nullptr); + if (file->fd.GetPathId() == 0 || !print_path) { + snprintf(out_buf, out_buf_size, "file %" PRIu64, file->fd.GetNumber()); + } else { + snprintf(out_buf, out_buf_size, "file %" PRIu64 + "(path " + "%" PRIu32 ")", + file->fd.GetNumber(), file->fd.GetPathId()); + } + } else { + snprintf(out_buf, out_buf_size, "level %d", level); + } +} + +void UniversalCompactionBuilder::SortedRun::DumpSizeInfo( + char* out_buf, size_t out_buf_size, size_t sorted_run_count) const { + if (level == 0) { + assert(file != nullptr); + snprintf(out_buf, out_buf_size, + "file %" PRIu64 "[%" ROCKSDB_PRIszt + "] " + "with size %" PRIu64 " (compensated size %" PRIu64 ")", + file->fd.GetNumber(), sorted_run_count, file->fd.GetFileSize(), + file->compensated_file_size); + } else { + snprintf(out_buf, out_buf_size, + "level %d[%" ROCKSDB_PRIszt + "] " + "with size %" PRIu64 " (compensated size %" PRIu64 ")", + level, sorted_run_count, size, compensated_file_size); + } +} + +std::vector<UniversalCompactionBuilder::SortedRun> +UniversalCompactionBuilder::CalculateSortedRuns( + const VersionStorageInfo& vstorage, const ImmutableCFOptions& /*ioptions*/, + const MutableCFOptions& mutable_cf_options) { + std::vector<UniversalCompactionBuilder::SortedRun> ret; + for (FileMetaData* f : vstorage.LevelFiles(0)) { + ret.emplace_back(0, f, f->fd.GetFileSize(), f->compensated_file_size, + f->being_compacted); + } + for (int level = 1; level < vstorage.num_levels(); level++) { + uint64_t total_compensated_size = 0U; + uint64_t total_size = 0U; + bool being_compacted = false; + bool is_first = true; + for (FileMetaData* f : vstorage.LevelFiles(level)) { + total_compensated_size += f->compensated_file_size; + total_size += f->fd.GetFileSize(); + if (mutable_cf_options.compaction_options_universal.allow_trivial_move == + true) { + if (f->being_compacted) { + being_compacted = f->being_compacted; + } + } else { + // Compaction always includes all files for a non-zero level, so for a + // non-zero level, all the files should share the same being_compacted + // value. + // This assumption is only valid when + // mutable_cf_options.compaction_options_universal.allow_trivial_move + // is false + assert(is_first || f->being_compacted == being_compacted); + } + if (is_first) { + being_compacted = f->being_compacted; + is_first = false; + } + } + if (total_compensated_size > 0) { + ret.emplace_back(level, nullptr, total_size, total_compensated_size, + being_compacted); + } + } + return ret; +} + +// Universal style of compaction. Pick files that are contiguous in +// time-range to compact. 
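// Reading PickCompaction() below, the strategies are tried in a fixed order:
// periodic compaction first (if any file is marked for it, it is a hard
// requirement); then, once there are at least
// level0_file_num_compaction_trigger sorted runs, compaction for size
// amplification, then for size ratio, then a forced compaction that merges
// just enough runs to bring the run count back down to the trigger; and
// finally delete-triggered compaction for files marked for compaction. Each
// step may fail to form a compaction (for example because its inputs are
// already being compacted), in which case the next one is tried.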
+Compaction* UniversalCompactionBuilder::PickCompaction() { + const int kLevel0 = 0; + score_ = vstorage_->CompactionScore(kLevel0); + sorted_runs_ = + CalculateSortedRuns(*vstorage_, ioptions_, mutable_cf_options_); + + if (sorted_runs_.size() == 0 || + (vstorage_->FilesMarkedForPeriodicCompaction().empty() && + vstorage_->FilesMarkedForCompaction().empty() && + sorted_runs_.size() < (unsigned int)mutable_cf_options_ + .level0_file_num_compaction_trigger)) { + ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: nothing to do\n", + cf_name_.c_str()); + TEST_SYNC_POINT_CALLBACK( + "UniversalCompactionBuilder::PickCompaction:Return", nullptr); + return nullptr; + } + VersionStorageInfo::LevelSummaryStorage tmp; + ROCKS_LOG_BUFFER_MAX_SZ( + log_buffer_, 3072, + "[%s] Universal: sorted runs files(%" ROCKSDB_PRIszt "): %s\n", + cf_name_.c_str(), sorted_runs_.size(), vstorage_->LevelSummary(&tmp)); + + Compaction* c = nullptr; + // Periodic compaction has higher priority than other type of compaction + // because it's a hard requirement. + if (!vstorage_->FilesMarkedForPeriodicCompaction().empty()) { + // Always need to do a full compaction for periodic compaction. + c = PickPeriodicCompaction(); + } + + // Check for size amplification. + if (c == nullptr && + sorted_runs_.size() >= + static_cast<size_t>( + mutable_cf_options_.level0_file_num_compaction_trigger)) { + if ((c = PickCompactionToReduceSizeAmp()) != nullptr) { + ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: compacting for size amp\n", + cf_name_.c_str()); + } else { + // Size amplification is within limits. Try reducing read + // amplification while maintaining file size ratios. + unsigned int ratio = + mutable_cf_options_.compaction_options_universal.size_ratio; + + if ((c = PickCompactionToReduceSortedRuns(ratio, UINT_MAX)) != nullptr) { + ROCKS_LOG_BUFFER(log_buffer_, + "[%s] Universal: compacting for size ratio\n", + cf_name_.c_str()); + } else { + // Size amplification and file size ratios are within configured limits. + // If max read amplification is exceeding configured limits, then force + // compaction without looking at filesize ratios and try to reduce + // the number of files to fewer than level0_file_num_compaction_trigger. 
+ // This is guaranteed by NeedsCompaction() + assert(sorted_runs_.size() >= + static_cast<size_t>( + mutable_cf_options_.level0_file_num_compaction_trigger)); + // Get the total number of sorted runs that are not being compacted + int num_sr_not_compacted = 0; + for (size_t i = 0; i < sorted_runs_.size(); i++) { + if (sorted_runs_[i].being_compacted == false) { + num_sr_not_compacted++; + } + } + + // The number of sorted runs that are not being compacted is greater + // than the maximum allowed number of sorted runs + if (num_sr_not_compacted > + mutable_cf_options_.level0_file_num_compaction_trigger) { + unsigned int num_files = + num_sr_not_compacted - + mutable_cf_options_.level0_file_num_compaction_trigger + 1; + if ((c = PickCompactionToReduceSortedRuns(UINT_MAX, num_files)) != + nullptr) { + ROCKS_LOG_BUFFER(log_buffer_, + "[%s] Universal: compacting for file num -- %u\n", + cf_name_.c_str(), num_files); + } + } + } + } + } + + if (c == nullptr) { + if ((c = PickDeleteTriggeredCompaction()) != nullptr) { + ROCKS_LOG_BUFFER(log_buffer_, + "[%s] Universal: delete triggered compaction\n", + cf_name_.c_str()); + } + } + + if (c == nullptr) { + TEST_SYNC_POINT_CALLBACK( + "UniversalCompactionBuilder::PickCompaction:Return", nullptr); + return nullptr; + } + + if (mutable_cf_options_.compaction_options_universal.allow_trivial_move == + true && + c->compaction_reason() != CompactionReason::kPeriodicCompaction) { + c->set_is_trivial_move(IsInputFilesNonOverlapping(c)); + } + +// validate that all the chosen files of L0 are non overlapping in time +#ifndef NDEBUG + SequenceNumber prev_smallest_seqno = 0U; + bool is_first = true; + + size_t level_index = 0U; + if (c->start_level() == 0) { + for (auto f : *c->inputs(0)) { + assert(f->fd.smallest_seqno <= f->fd.largest_seqno); + if (is_first) { + is_first = false; + } + prev_smallest_seqno = f->fd.smallest_seqno; + } + level_index = 1U; + } + for (; level_index < c->num_input_levels(); level_index++) { + if (c->num_input_files(level_index) != 0) { + SequenceNumber smallest_seqno = 0U; + SequenceNumber largest_seqno = 0U; + GetSmallestLargestSeqno(*(c->inputs(level_index)), &smallest_seqno, + &largest_seqno); + if (is_first) { + is_first = false; + } else if (prev_smallest_seqno > 0) { + // A level is considered as the bottommost level if there are + // no files in higher levels or if files in higher levels do + // not overlap with the files being compacted. Sequence numbers + // of files in bottommost level can be set to 0 to help + // compression. As a result, the following assert may not hold + // if the prev_smallest_seqno is 0. + assert(prev_smallest_seqno > largest_seqno); + } + prev_smallest_seqno = smallest_seqno; + } + } +#endif + // update statistics + RecordInHistogram(ioptions_.statistics, NUM_FILES_IN_SINGLE_COMPACTION, + c->inputs(0)->size()); + + picker_->RegisterCompaction(c); + vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_); + + TEST_SYNC_POINT_CALLBACK("UniversalCompactionBuilder::PickCompaction:Return", + c); + return c; +} + +uint32_t UniversalCompactionBuilder::GetPathId( + const ImmutableCFOptions& ioptions, + const MutableCFOptions& mutable_cf_options, uint64_t file_size) { + // Two conditions need to be satisfied: + // (1) the target path needs to be able to hold the file's size + // (2) Total size left in this and previous paths need to be not + // smaller than expected future file size before this new file is + // compacted, which is estimated based on size_ratio. 
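  //      (In the code below that future size is computed as
  //       file_size * (100 - size_ratio) / 100, and path p is returned as
  //       soon as its target_size can hold the file and the capacity of
  //       paths 0..p, net of the new file, exceeds that estimate.)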
+ // For example, if now we are compacting files of size (1, 1, 2, 4, 8), + // we will make sure the target file, probably with size of 16, will be + // placed in a path so that eventually when new files are generated and + // compacted to (1, 1, 2, 4, 8, 16), all those files can be stored in or + // before the path we chose. + // + // TODO(sdong): now the case of multiple column families is not + // considered in this algorithm. So the target size can be violated in + // that case. We need to improve it. + uint64_t accumulated_size = 0; + uint64_t future_size = + file_size * + (100 - mutable_cf_options.compaction_options_universal.size_ratio) / 100; + uint32_t p = 0; + assert(!ioptions.cf_paths.empty()); + for (; p < ioptions.cf_paths.size() - 1; p++) { + uint64_t target_size = ioptions.cf_paths[p].target_size; + if (target_size > file_size && + accumulated_size + (target_size - file_size) > future_size) { + return p; + } + accumulated_size += target_size; + } + return p; +} + +// +// Consider compaction files based on their size differences with +// the next file in time order. +// +Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns( + unsigned int ratio, unsigned int max_number_of_files_to_compact) { + unsigned int min_merge_width = + mutable_cf_options_.compaction_options_universal.min_merge_width; + unsigned int max_merge_width = + mutable_cf_options_.compaction_options_universal.max_merge_width; + + const SortedRun* sr = nullptr; + bool done = false; + size_t start_index = 0; + unsigned int candidate_count = 0; + + unsigned int max_files_to_compact = + std::min(max_merge_width, max_number_of_files_to_compact); + min_merge_width = std::max(min_merge_width, 2U); + + // Caller checks the size before executing this function. This invariant is + // important because otherwise we may have a possible integer underflow when + // dealing with unsigned types. + assert(sorted_runs_.size() > 0); + + // Considers a candidate file only if it is smaller than the + // total size accumulated so far. + for (size_t loop = 0; loop < sorted_runs_.size(); loop++) { + candidate_count = 0; + + // Skip files that are already being compacted + for (sr = nullptr; loop < sorted_runs_.size(); loop++) { + sr = &sorted_runs_[loop]; + + if (!sr->being_compacted) { + candidate_count = 1; + break; + } + char file_num_buf[kFormatFileNumberBufSize]; + sr->Dump(file_num_buf, sizeof(file_num_buf)); + ROCKS_LOG_BUFFER(log_buffer_, + "[%s] Universal: %s" + "[%d] being compacted, skipping", + cf_name_.c_str(), file_num_buf, loop); + + sr = nullptr; + } + + // This file is not being compacted. Consider it as the + // first candidate to be compacted. + uint64_t candidate_size = sr != nullptr ? sr->compensated_file_size : 0; + if (sr != nullptr) { + char file_num_buf[kFormatFileNumberBufSize]; + sr->Dump(file_num_buf, sizeof(file_num_buf), true); + ROCKS_LOG_BUFFER(log_buffer_, + "[%s] Universal: Possible candidate %s[%d].", + cf_name_.c_str(), file_num_buf, loop); + } + + // Check if the succeeding files need compaction. + for (size_t i = loop + 1; + candidate_count < max_files_to_compact && i < sorted_runs_.size(); + i++) { + const SortedRun* succeeding_sr = &sorted_runs_[i]; + if (succeeding_sr->being_compacted) { + break; + } + // Pick files if the total/last candidate file size (increased by the + // specified ratio) is still larger than the next candidate file. 
+ // candidate_size is the total size of files picked so far with the + // default kCompactionStopStyleTotalSize; with + // kCompactionStopStyleSimilarSize, it's simply the size of the last + // picked file. + double sz = candidate_size * (100.0 + ratio) / 100.0; + if (sz < static_cast<double>(succeeding_sr->size)) { + break; + } + if (mutable_cf_options_.compaction_options_universal.stop_style == + kCompactionStopStyleSimilarSize) { + // Similar-size stopping rule: also check the last picked file isn't + // far larger than the next candidate file. + sz = (succeeding_sr->size * (100.0 + ratio)) / 100.0; + if (sz < static_cast<double>(candidate_size)) { + // If the small file we've encountered begins a run of similar-size + // files, we'll pick them up on a future iteration of the outer + // loop. If it's some lonely straggler, it'll eventually get picked + // by the last-resort read amp strategy which disregards size ratios. + break; + } + candidate_size = succeeding_sr->compensated_file_size; + } else { // default kCompactionStopStyleTotalSize + candidate_size += succeeding_sr->compensated_file_size; + } + candidate_count++; + } + + // Found a series of consecutive files that need compaction. + if (candidate_count >= (unsigned int)min_merge_width) { + start_index = loop; + done = true; + break; + } else { + for (size_t i = loop; + i < loop + candidate_count && i < sorted_runs_.size(); i++) { + const SortedRun* skipping_sr = &sorted_runs_[i]; + char file_num_buf[256]; + skipping_sr->DumpSizeInfo(file_num_buf, sizeof(file_num_buf), loop); + ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: Skipping %s", + cf_name_.c_str(), file_num_buf); + } + } + } + if (!done || candidate_count <= 1) { + return nullptr; + } + size_t first_index_after = start_index + candidate_count; + // Compression is enabled if files compacted earlier already reached + // size ratio of compression. 
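  // (Concretely: with compression_size_percent = 70, the loop below sums the
  // runs older than the picked window; if they already hold at least 70% of
  // the data, the output of this compaction lies entirely in the newest 30%
  // and is left uncompressed, otherwise it reaches into the oldest 70% and
  // gets compressed.)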
+ bool enable_compression = true; + int ratio_to_compress = + mutable_cf_options_.compaction_options_universal.compression_size_percent; + if (ratio_to_compress >= 0) { + uint64_t total_size = 0; + for (auto& sorted_run : sorted_runs_) { + total_size += sorted_run.compensated_file_size; + } + + uint64_t older_file_size = 0; + for (size_t i = sorted_runs_.size() - 1; i >= first_index_after; i--) { + older_file_size += sorted_runs_[i].size; + if (older_file_size * 100L >= total_size * (long)ratio_to_compress) { + enable_compression = false; + break; + } + } + } + + uint64_t estimated_total_size = 0; + for (unsigned int i = 0; i < first_index_after; i++) { + estimated_total_size += sorted_runs_[i].size; + } + uint32_t path_id = + GetPathId(ioptions_, mutable_cf_options_, estimated_total_size); + int start_level = sorted_runs_[start_index].level; + int output_level; + if (first_index_after == sorted_runs_.size()) { + output_level = vstorage_->num_levels() - 1; + } else if (sorted_runs_[first_index_after].level == 0) { + output_level = 0; + } else { + output_level = sorted_runs_[first_index_after].level - 1; + } + + // last level is reserved for the files ingested behind + if (ioptions_.allow_ingest_behind && + (output_level == vstorage_->num_levels() - 1)) { + assert(output_level > 1); + output_level--; + } + + std::vector<CompactionInputFiles> inputs(vstorage_->num_levels()); + for (size_t i = 0; i < inputs.size(); ++i) { + inputs[i].level = start_level + static_cast<int>(i); + } + for (size_t i = start_index; i < first_index_after; i++) { + auto& picking_sr = sorted_runs_[i]; + if (picking_sr.level == 0) { + FileMetaData* picking_file = picking_sr.file; + inputs[0].files.push_back(picking_file); + } else { + auto& files = inputs[picking_sr.level - start_level].files; + for (auto* f : vstorage_->LevelFiles(picking_sr.level)) { + files.push_back(f); + } + } + char file_num_buf[256]; + picking_sr.DumpSizeInfo(file_num_buf, sizeof(file_num_buf), i); + ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: Picking %s", + cf_name_.c_str(), file_num_buf); + } + + CompactionReason compaction_reason; + if (max_number_of_files_to_compact == UINT_MAX) { + compaction_reason = CompactionReason::kUniversalSizeRatio; + } else { + compaction_reason = CompactionReason::kUniversalSortedRunNum; + } + return new Compaction( + vstorage_, ioptions_, mutable_cf_options_, std::move(inputs), + output_level, + MaxFileSizeForLevel(mutable_cf_options_, output_level, + kCompactionStyleUniversal), + LLONG_MAX, path_id, + GetCompressionType(ioptions_, vstorage_, mutable_cf_options_, start_level, + 1, enable_compression), + GetCompressionOptions(ioptions_, vstorage_, start_level, + enable_compression), + /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ false, + score_, false /* deletion_compaction */, compaction_reason); +} + +// Look at overall size amplification. If size amplification +// exceeeds the configured value, then do a compaction +// of the candidate files all the way upto the earliest +// base file (overrides configured values of file-size ratios, +// min_merge_width and max_merge_width). 
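// Concretely, the check below compares the combined size of all candidate
// runs except the oldest one against the size of the oldest run: with
// max_size_amplification_percent = 200, newer runs totalling 32 against an
// oldest run of 16 satisfy 32 * 100 >= 200 * 16, so everything from the first
// run not being compacted down to the oldest run is compacted into one.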
+// +Compaction* UniversalCompactionBuilder::PickCompactionToReduceSizeAmp() { + // percentage flexibility while reducing size amplification + uint64_t ratio = mutable_cf_options_.compaction_options_universal + .max_size_amplification_percent; + + unsigned int candidate_count = 0; + uint64_t candidate_size = 0; + size_t start_index = 0; + const SortedRun* sr = nullptr; + + assert(!sorted_runs_.empty()); + if (sorted_runs_.back().being_compacted) { + return nullptr; + } + + // Skip files that are already being compacted + for (size_t loop = 0; loop < sorted_runs_.size() - 1; loop++) { + sr = &sorted_runs_[loop]; + if (!sr->being_compacted) { + start_index = loop; // Consider this as the first candidate. + break; + } + char file_num_buf[kFormatFileNumberBufSize]; + sr->Dump(file_num_buf, sizeof(file_num_buf), true); + ROCKS_LOG_BUFFER(log_buffer_, + "[%s] Universal: skipping %s[%d] compacted %s", + cf_name_.c_str(), file_num_buf, loop, + " cannot be a candidate to reduce size amp.\n"); + sr = nullptr; + } + + if (sr == nullptr) { + return nullptr; // no candidate files + } + { + char file_num_buf[kFormatFileNumberBufSize]; + sr->Dump(file_num_buf, sizeof(file_num_buf), true); + ROCKS_LOG_BUFFER( + log_buffer_, + "[%s] Universal: First candidate %s[%" ROCKSDB_PRIszt "] %s", + cf_name_.c_str(), file_num_buf, start_index, " to reduce size amp.\n"); + } + + // keep adding up all the remaining files + for (size_t loop = start_index; loop < sorted_runs_.size() - 1; loop++) { + sr = &sorted_runs_[loop]; + if (sr->being_compacted) { + char file_num_buf[kFormatFileNumberBufSize]; + sr->Dump(file_num_buf, sizeof(file_num_buf), true); + ROCKS_LOG_BUFFER( + log_buffer_, "[%s] Universal: Possible candidate %s[%d] %s", + cf_name_.c_str(), file_num_buf, start_index, + " is already being compacted. No size amp reduction possible.\n"); + return nullptr; + } + candidate_size += sr->compensated_file_size; + candidate_count++; + } + if (candidate_count == 0) { + return nullptr; + } + + // size of earliest file + uint64_t earliest_file_size = sorted_runs_.back().size; + + // size amplification = percentage of additional size + if (candidate_size * 100 < ratio * earliest_file_size) { + ROCKS_LOG_BUFFER( + log_buffer_, + "[%s] Universal: size amp not needed. newer-files-total-size %" PRIu64 + " earliest-file-size %" PRIu64, + cf_name_.c_str(), candidate_size, earliest_file_size); + return nullptr; + } else { + ROCKS_LOG_BUFFER( + log_buffer_, + "[%s] Universal: size amp needed. newer-files-total-size %" PRIu64 + " earliest-file-size %" PRIu64, + cf_name_.c_str(), candidate_size, earliest_file_size); + } + return PickCompactionToOldest(start_index, + CompactionReason::kUniversalSizeAmplification); +} + +// Pick files marked for compaction. Typically, files are marked by +// CompactOnDeleteCollector due to the presence of tombstones. +Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() { + CompactionInputFiles start_level_inputs; + int output_level; + std::vector<CompactionInputFiles> inputs; + + if (vstorage_->num_levels() == 1) { + // This is single level universal. Since we're basically trying to reclaim + // space by processing files marked for compaction due to high tombstone + // density, let's do the same thing as compaction to reduce size amp which + // has the same goals. 
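    // Reading the loop below: once a file marked for compaction is seen,
    // that file and every file after it in LevelFiles(0) are pulled in, so
    // the pick always runs through the end of the L0 list, mirroring the
    // compact-to-the-oldest behaviour of the size-amplification path.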
+ bool compact = false; + + start_level_inputs.level = 0; + start_level_inputs.files.clear(); + output_level = 0; + for (FileMetaData* f : vstorage_->LevelFiles(0)) { + if (f->marked_for_compaction) { + compact = true; + } + if (compact) { + start_level_inputs.files.push_back(f); + } + } + if (start_level_inputs.size() <= 1) { + // If only the last file in L0 is marked for compaction, ignore it + return nullptr; + } + inputs.push_back(start_level_inputs); + } else { + int start_level; + + // For multi-level universal, the strategy is to make this look more like + // leveled. We pick one of the files marked for compaction and compact with + // overlapping files in the adjacent level. + picker_->PickFilesMarkedForCompaction(cf_name_, vstorage_, &start_level, + &output_level, &start_level_inputs); + if (start_level_inputs.empty()) { + return nullptr; + } + + // Pick the first non-empty level after the start_level + for (output_level = start_level + 1; output_level < vstorage_->num_levels(); + output_level++) { + if (vstorage_->NumLevelFiles(output_level) != 0) { + break; + } + } + + // If all higher levels are empty, pick the highest level as output level + if (output_level == vstorage_->num_levels()) { + if (start_level == 0) { + output_level = vstorage_->num_levels() - 1; + } else { + // If start level is non-zero and all higher levels are empty, this + // compaction will translate into a trivial move. Since the idea is + // to reclaim space and trivial move doesn't help with that, we + // skip compaction in this case and return nullptr + return nullptr; + } + } + if (ioptions_.allow_ingest_behind && + output_level == vstorage_->num_levels() - 1) { + assert(output_level > 1); + output_level--; + } + + if (output_level != 0) { + if (start_level == 0) { + if (!picker_->GetOverlappingL0Files(vstorage_, &start_level_inputs, + output_level, nullptr)) { + return nullptr; + } + } + + CompactionInputFiles output_level_inputs; + int parent_index = -1; + + output_level_inputs.level = output_level; + if (!picker_->SetupOtherInputs(cf_name_, mutable_cf_options_, vstorage_, + &start_level_inputs, &output_level_inputs, + &parent_index, -1)) { + return nullptr; + } + inputs.push_back(start_level_inputs); + if (!output_level_inputs.empty()) { + inputs.push_back(output_level_inputs); + } + if (picker_->FilesRangeOverlapWithCompaction(inputs, output_level)) { + return nullptr; + } + } else { + inputs.push_back(start_level_inputs); + } + } + + uint64_t estimated_total_size = 0; + // Use size of the output level as estimated file size + for (FileMetaData* f : vstorage_->LevelFiles(output_level)) { + estimated_total_size += f->fd.GetFileSize(); + } + uint32_t path_id = + GetPathId(ioptions_, mutable_cf_options_, estimated_total_size); + return new Compaction( + vstorage_, ioptions_, mutable_cf_options_, std::move(inputs), + output_level, + MaxFileSizeForLevel(mutable_cf_options_, output_level, + kCompactionStyleUniversal), + /* max_grandparent_overlap_bytes */ LLONG_MAX, path_id, + GetCompressionType(ioptions_, vstorage_, mutable_cf_options_, + output_level, 1), + GetCompressionOptions(ioptions_, vstorage_, output_level), + /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ true, + score_, false /* deletion_compaction */, + CompactionReason::kFilesMarkedForCompaction); +} + +Compaction* UniversalCompactionBuilder::PickCompactionToOldest( + size_t start_index, CompactionReason compaction_reason) { + assert(start_index < sorted_runs_.size()); + + // Estimate total file size + uint64_t 
estimated_total_size = 0; + for (size_t loop = start_index; loop < sorted_runs_.size(); loop++) { + estimated_total_size += sorted_runs_[loop].size; + } + uint32_t path_id = + GetPathId(ioptions_, mutable_cf_options_, estimated_total_size); + int start_level = sorted_runs_[start_index].level; + + std::vector<CompactionInputFiles> inputs(vstorage_->num_levels()); + for (size_t i = 0; i < inputs.size(); ++i) { + inputs[i].level = start_level + static_cast<int>(i); + } + for (size_t loop = start_index; loop < sorted_runs_.size(); loop++) { + auto& picking_sr = sorted_runs_[loop]; + if (picking_sr.level == 0) { + FileMetaData* f = picking_sr.file; + inputs[0].files.push_back(f); + } else { + auto& files = inputs[picking_sr.level - start_level].files; + for (auto* f : vstorage_->LevelFiles(picking_sr.level)) { + files.push_back(f); + } + } + std::string comp_reason_print_string; + if (compaction_reason == CompactionReason::kPeriodicCompaction) { + comp_reason_print_string = "periodic compaction"; + } else if (compaction_reason == + CompactionReason::kUniversalSizeAmplification) { + comp_reason_print_string = "size amp"; + } else { + assert(false); + } + + char file_num_buf[256]; + picking_sr.DumpSizeInfo(file_num_buf, sizeof(file_num_buf), loop); + ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: %s picking %s", + cf_name_.c_str(), comp_reason_print_string.c_str(), + file_num_buf); + } + + // output files at the bottom most level, unless it's reserved + int output_level = vstorage_->num_levels() - 1; + // last level is reserved for the files ingested behind + if (ioptions_.allow_ingest_behind) { + assert(output_level > 1); + output_level--; + } + + // We never check size for + // compaction_options_universal.compression_size_percent, + // because we always compact all the files, so always compress. + return new Compaction( + vstorage_, ioptions_, mutable_cf_options_, std::move(inputs), + output_level, + MaxFileSizeForLevel(mutable_cf_options_, output_level, + kCompactionStyleUniversal), + LLONG_MAX, path_id, + GetCompressionType(ioptions_, vstorage_, mutable_cf_options_, start_level, + 1, true /* enable_compression */), + GetCompressionOptions(ioptions_, vstorage_, start_level, + true /* enable_compression */), + /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ false, + score_, false /* deletion_compaction */, compaction_reason); +} + +Compaction* UniversalCompactionBuilder::PickPeriodicCompaction() { + ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: Periodic Compaction", + cf_name_.c_str()); + + // In universal compaction, sorted runs contain older data are almost always + // generated earlier too. To simplify the problem, we just try to trigger + // a full compaction. We start from the oldest sorted run and include + // all sorted runs, until we hit a sorted already being compacted. + // Since usually the largest (which is usually the oldest) sorted run is + // included anyway, doing a full compaction won't increase write + // amplification much. + + // Get some information from marked files to check whether a file is + // included in the compaction. + + size_t start_index = sorted_runs_.size(); + while (start_index > 0 && !sorted_runs_[start_index - 1].being_compacted) { + start_index--; + } + if (start_index == sorted_runs_.size()) { + return nullptr; + } + + // There is a rare corner case where we can't pick up all the files + // because some files are being compacted and we end up with picking files + // but none of them need periodic compaction. 
Unless we simply recompact + // the last sorted run (either the last level or last L0 file), we would just + // execute the compaction, in order to simplify the logic. + if (start_index == sorted_runs_.size() - 1) { + bool included_file_marked = false; + int start_level = sorted_runs_[start_index].level; + FileMetaData* start_file = sorted_runs_[start_index].file; + for (const std::pair<int, FileMetaData*>& level_file_pair : + vstorage_->FilesMarkedForPeriodicCompaction()) { + if (start_level != 0) { + // Last sorted run is a level + if (start_level == level_file_pair.first) { + included_file_marked = true; + break; + } + } else { + // Last sorted run is a L0 file. + if (start_file == level_file_pair.second) { + included_file_marked = true; + break; + } + } + } + if (!included_file_marked) { + ROCKS_LOG_BUFFER(log_buffer_, + "[%s] Universal: Cannot form a compaction covering file " + "marked for periodic compaction", + cf_name_.c_str()); + return nullptr; + } + } + + Compaction* c = PickCompactionToOldest(start_index, + CompactionReason::kPeriodicCompaction); + + TEST_SYNC_POINT_CALLBACK( + "UniversalCompactionPicker::PickPeriodicCompaction:Return", c); + + return c; +} +} // namespace ROCKSDB_NAMESPACE + +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/db/compaction/compaction_picker_universal.h b/src/rocksdb/db/compaction/compaction_picker_universal.h new file mode 100644 index 000000000..c3f55f5d3 --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_picker_universal.h @@ -0,0 +1,31 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#ifndef ROCKSDB_LITE + +#include "db/compaction/compaction_picker.h" + +namespace ROCKSDB_NAMESPACE { +class UniversalCompactionPicker : public CompactionPicker { + public: + UniversalCompactionPicker(const ImmutableCFOptions& ioptions, + const InternalKeyComparator* icmp) + : CompactionPicker(ioptions, icmp) {} + virtual Compaction* PickCompaction( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, LogBuffer* log_buffer, + SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override; + virtual int MaxOutputLevel() const override { return NumberLevels() - 1; } + + virtual bool NeedsCompaction( + const VersionStorageInfo* vstorage) const override; +}; +} // namespace ROCKSDB_NAMESPACE +#endif // !ROCKSDB_LITE |
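// The sketch below is a self-contained toy model (plain C++, not RocksDB code)
// of the size-ratio picking rule implemented by
// UniversalCompactionBuilder::PickCompactionToReduceSortedRuns() earlier in
// this diff: starting from a given run, keep absorbing the next (older) sorted
// run while the bytes picked so far, inflated by size_ratio percent, still
// cover it. Run sizes are listed newest to oldest; the min/max merge widths
// and being_compacted checks of the real code are intentionally left out.

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Returns how many consecutive runs, starting at `start`, the rule would pick
// with the default total-size stop style.
std::size_t PickBySizeRatio(const std::vector<uint64_t>& run_sizes,
                            std::size_t start, unsigned int ratio_percent) {
  if (start >= run_sizes.size()) {
    return 0;
  }
  uint64_t picked = run_sizes[start];
  std::size_t count = 1;
  for (std::size_t i = start + 1; i < run_sizes.size(); ++i) {
    double inflated = picked * (100.0 + ratio_percent) / 100.0;
    if (inflated < static_cast<double>(run_sizes[i])) {
      break;  // the next run is too large relative to what is picked so far
    }
    picked += run_sizes[i];
    ++count;
  }
  return count;
}

int main() {
  // With size_ratio = 1%, runs of 4, 4 and 8 merge (4 covers the second 4,
  // then the accumulated 8 covers the 8), but the 100-unit run stops the scan.
  assert(PickBySizeRatio({4, 4, 8, 100}, 0, 1) == 3);
  // A very large ratio lets the scan swallow everything.
  assert(PickBySizeRatio({4, 4, 8, 100}, 0, 10000) == 4);
  return 0;
}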