path: root/src/rocksdb/db/compaction
Diffstat (limited to 'src/rocksdb/db/compaction')
-rw-r--r--  src/rocksdb/db/compaction/compaction.cc                     564
-rw-r--r--  src/rocksdb/db/compaction/compaction.h                      384
-rw-r--r--  src/rocksdb/db/compaction/compaction_iteration_stats.h       37
-rw-r--r--  src/rocksdb/db/compaction/compaction_iterator.cc            774
-rw-r--r--  src/rocksdb/db/compaction/compaction_iterator.h             240
-rw-r--r--  src/rocksdb/db/compaction/compaction_iterator_test.cc       976
-rw-r--r--  src/rocksdb/db/compaction/compaction_job.cc                1700
-rw-r--r--  src/rocksdb/db/compaction/compaction_job.h                  198
-rw-r--r--  src/rocksdb/db/compaction/compaction_job_stats_test.cc     1043
-rw-r--r--  src/rocksdb/db/compaction/compaction_job_test.cc           1082
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker.cc             1131
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker.h               313
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker_fifo.cc         244
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker_fifo.h           53
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker_level.cc        558
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker_level.h          32
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker_test.cc        1741
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker_universal.cc   1105
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker_universal.h      31
19 files changed, 12206 insertions, 0 deletions
diff --git a/src/rocksdb/db/compaction/compaction.cc b/src/rocksdb/db/compaction/compaction.cc
new file mode 100644
index 000000000..5c34fdcaa
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction.cc
@@ -0,0 +1,564 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <cinttypes>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/compaction/compaction.h"
+#include "rocksdb/compaction_filter.h"
+#include "test_util/sync_point.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const uint64_t kRangeTombstoneSentinel =
+ PackSequenceAndType(kMaxSequenceNumber, kTypeRangeDeletion);
+
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
+ const InternalKey& b) {
+ auto c = user_cmp->Compare(a.user_key(), b.user_key());
+ if (c != 0) {
+ return c;
+ }
+ auto a_footer = ExtractInternalKeyFooter(a.Encode());
+ auto b_footer = ExtractInternalKeyFooter(b.Encode());
+ if (a_footer == kRangeTombstoneSentinel) {
+ if (b_footer != kRangeTombstoneSentinel) {
+ return -1;
+ }
+ } else if (b_footer == kRangeTombstoneSentinel) {
+ return 1;
+ }
+ return 0;
+}
+
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey* a,
+ const InternalKey& b) {
+ if (a == nullptr) {
+ return -1;
+ }
+ return sstableKeyCompare(user_cmp, *a, b);
+}
+
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
+ const InternalKey* b) {
+ if (b == nullptr) {
+ return -1;
+ }
+ return sstableKeyCompare(user_cmp, a, *b);
+}
+
+uint64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
+ uint64_t sum = 0;
+ for (size_t i = 0; i < files.size() && files[i]; i++) {
+ sum += files[i]->fd.GetFileSize();
+ }
+ return sum;
+}
+
+void Compaction::SetInputVersion(Version* _input_version) {
+ input_version_ = _input_version;
+ cfd_ = input_version_->cfd();
+
+ cfd_->Ref();
+ input_version_->Ref();
+ edit_.SetColumnFamily(cfd_->GetID());
+}
+
+void Compaction::GetBoundaryKeys(
+ VersionStorageInfo* vstorage,
+ const std::vector<CompactionInputFiles>& inputs, Slice* smallest_user_key,
+ Slice* largest_user_key) {
+ bool initialized = false;
+ const Comparator* ucmp = vstorage->InternalComparator()->user_comparator();
+ for (size_t i = 0; i < inputs.size(); ++i) {
+ if (inputs[i].files.empty()) {
+ continue;
+ }
+ if (inputs[i].level == 0) {
+ // we need to consider all files on level 0
+ for (const auto* f : inputs[i].files) {
+ const Slice& start_user_key = f->smallest.user_key();
+ if (!initialized ||
+ ucmp->Compare(start_user_key, *smallest_user_key) < 0) {
+ *smallest_user_key = start_user_key;
+ }
+ const Slice& end_user_key = f->largest.user_key();
+ if (!initialized ||
+ ucmp->Compare(end_user_key, *largest_user_key) > 0) {
+ *largest_user_key = end_user_key;
+ }
+ initialized = true;
+ }
+ } else {
+ // we only need to consider the first and last file
+ const Slice& start_user_key = inputs[i].files[0]->smallest.user_key();
+ if (!initialized ||
+ ucmp->Compare(start_user_key, *smallest_user_key) < 0) {
+ *smallest_user_key = start_user_key;
+ }
+ const Slice& end_user_key = inputs[i].files.back()->largest.user_key();
+ if (!initialized || ucmp->Compare(end_user_key, *largest_user_key) > 0) {
+ *largest_user_key = end_user_key;
+ }
+ initialized = true;
+ }
+ }
+}
+
+std::vector<CompactionInputFiles> Compaction::PopulateWithAtomicBoundaries(
+ VersionStorageInfo* vstorage, std::vector<CompactionInputFiles> inputs) {
+ const Comparator* ucmp = vstorage->InternalComparator()->user_comparator();
+ for (size_t i = 0; i < inputs.size(); i++) {
+ if (inputs[i].level == 0 || inputs[i].files.empty()) {
+ continue;
+ }
+ inputs[i].atomic_compaction_unit_boundaries.reserve(inputs[i].files.size());
+ AtomicCompactionUnitBoundary cur_boundary;
+ size_t first_atomic_idx = 0;
+ auto add_unit_boundary = [&](size_t to) {
+ if (first_atomic_idx == to) return;
+ for (size_t k = first_atomic_idx; k < to; k++) {
+ inputs[i].atomic_compaction_unit_boundaries.push_back(cur_boundary);
+ }
+ first_atomic_idx = to;
+ };
+ for (size_t j = 0; j < inputs[i].files.size(); j++) {
+ const auto* f = inputs[i].files[j];
+ if (j == 0) {
+ // First file in a level.
+ cur_boundary.smallest = &f->smallest;
+ cur_boundary.largest = &f->largest;
+ } else if (sstableKeyCompare(ucmp, *cur_boundary.largest, f->smallest) ==
+ 0) {
+ // SSTs overlap but the end key of the previous file was not
+ // artificially extended by a range tombstone. Extend the current
+ // boundary.
+ cur_boundary.largest = &f->largest;
+ } else {
+ // Atomic compaction unit has ended.
+ add_unit_boundary(j);
+ cur_boundary.smallest = &f->smallest;
+ cur_boundary.largest = &f->largest;
+ }
+ }
+ add_unit_boundary(inputs[i].files.size());
+ assert(inputs[i].files.size() ==
+ inputs[i].atomic_compaction_unit_boundaries.size());
+ }
+ return inputs;
+}
+
+// helper function to determine if compaction is creating files at the
+// bottommost level
+bool Compaction::IsBottommostLevel(
+ int output_level, VersionStorageInfo* vstorage,
+ const std::vector<CompactionInputFiles>& inputs) {
+ int output_l0_idx;
+ if (output_level == 0) {
+ output_l0_idx = 0;
+ for (const auto* file : vstorage->LevelFiles(0)) {
+ if (inputs[0].files.back() == file) {
+ break;
+ }
+ ++output_l0_idx;
+ }
+ assert(static_cast<size_t>(output_l0_idx) < vstorage->LevelFiles(0).size());
+ } else {
+ output_l0_idx = -1;
+ }
+ Slice smallest_key, largest_key;
+ GetBoundaryKeys(vstorage, inputs, &smallest_key, &largest_key);
+ return !vstorage->RangeMightExistAfterSortedRun(smallest_key, largest_key,
+ output_level, output_l0_idx);
+}
+
+// test function to validate the functionality of IsBottommostLevel()
+// function -- determines if compaction with inputs and storage is bottommost
+bool Compaction::TEST_IsBottommostLevel(
+ int output_level, VersionStorageInfo* vstorage,
+ const std::vector<CompactionInputFiles>& inputs) {
+ return IsBottommostLevel(output_level, vstorage, inputs);
+}
+
+bool Compaction::IsFullCompaction(
+ VersionStorageInfo* vstorage,
+ const std::vector<CompactionInputFiles>& inputs) {
+ size_t num_files_in_compaction = 0;
+ size_t total_num_files = 0;
+ for (int l = 0; l < vstorage->num_levels(); l++) {
+ total_num_files += vstorage->NumLevelFiles(l);
+ }
+ for (size_t i = 0; i < inputs.size(); i++) {
+ num_files_in_compaction += inputs[i].size();
+ }
+ return num_files_in_compaction == total_num_files;
+}
+
+Compaction::Compaction(VersionStorageInfo* vstorage,
+ const ImmutableCFOptions& _immutable_cf_options,
+ const MutableCFOptions& _mutable_cf_options,
+ std::vector<CompactionInputFiles> _inputs,
+ int _output_level, uint64_t _target_file_size,
+ uint64_t _max_compaction_bytes, uint32_t _output_path_id,
+ CompressionType _compression,
+ CompressionOptions _compression_opts,
+ uint32_t _max_subcompactions,
+ std::vector<FileMetaData*> _grandparents,
+ bool _manual_compaction, double _score,
+ bool _deletion_compaction,
+ CompactionReason _compaction_reason)
+ : input_vstorage_(vstorage),
+ start_level_(_inputs[0].level),
+ output_level_(_output_level),
+ max_output_file_size_(_target_file_size),
+ max_compaction_bytes_(_max_compaction_bytes),
+ max_subcompactions_(_max_subcompactions),
+ immutable_cf_options_(_immutable_cf_options),
+ mutable_cf_options_(_mutable_cf_options),
+ input_version_(nullptr),
+ number_levels_(vstorage->num_levels()),
+ cfd_(nullptr),
+ output_path_id_(_output_path_id),
+ output_compression_(_compression),
+ output_compression_opts_(_compression_opts),
+ deletion_compaction_(_deletion_compaction),
+ inputs_(PopulateWithAtomicBoundaries(vstorage, std::move(_inputs))),
+ grandparents_(std::move(_grandparents)),
+ score_(_score),
+ bottommost_level_(IsBottommostLevel(output_level_, vstorage, inputs_)),
+ is_full_compaction_(IsFullCompaction(vstorage, inputs_)),
+ is_manual_compaction_(_manual_compaction),
+ is_trivial_move_(false),
+ compaction_reason_(_compaction_reason) {
+ MarkFilesBeingCompacted(true);
+ if (is_manual_compaction_) {
+ compaction_reason_ = CompactionReason::kManualCompaction;
+ }
+ if (max_subcompactions_ == 0) {
+ max_subcompactions_ = immutable_cf_options_.max_subcompactions;
+ }
+ if (!bottommost_level_) {
+ // Currently we only enable dictionary compression during compaction to the
+ // bottommost level.
+ output_compression_opts_.max_dict_bytes = 0;
+ output_compression_opts_.zstd_max_train_bytes = 0;
+ }
+
+#ifndef NDEBUG
+ for (size_t i = 1; i < inputs_.size(); ++i) {
+ assert(inputs_[i].level > inputs_[i - 1].level);
+ }
+#endif
+
+ // setup input_levels_
+ {
+ input_levels_.resize(num_input_levels());
+ for (size_t which = 0; which < num_input_levels(); which++) {
+ DoGenerateLevelFilesBrief(&input_levels_[which], inputs_[which].files,
+ &arena_);
+ }
+ }
+
+ GetBoundaryKeys(vstorage, inputs_, &smallest_user_key_, &largest_user_key_);
+}
+
+Compaction::~Compaction() {
+ if (input_version_ != nullptr) {
+ input_version_->Unref();
+ }
+ if (cfd_ != nullptr) {
+ cfd_->UnrefAndTryDelete();
+ }
+}
+
+bool Compaction::InputCompressionMatchesOutput() const {
+ int base_level = input_vstorage_->base_level();
+ bool matches = (GetCompressionType(immutable_cf_options_, input_vstorage_,
+ mutable_cf_options_, start_level_,
+ base_level) == output_compression_);
+ if (matches) {
+ TEST_SYNC_POINT("Compaction::InputCompressionMatchesOutput:Matches");
+ return true;
+ }
+ TEST_SYNC_POINT("Compaction::InputCompressionMatchesOutput:DidntMatch");
+ return matches;
+}
+
+bool Compaction::IsTrivialMove() const {
+ // Avoid a move if there is lots of overlapping grandparent data.
+ // Otherwise, the move could create a parent file that will require
+ // a very expensive merge later on.
+ // If start_level_== output_level_, the purpose is to force compaction
+ // filter to be applied to that level, and thus cannot be a trivial move.
+
+ // Check if the start level has files with overlapping ranges
+ if (start_level_ == 0 && input_vstorage_->level0_non_overlapping() == false) {
+ // We cannot move files from L0 to L1 if the files are overlapping
+ return false;
+ }
+
+ if (is_manual_compaction_ &&
+ (immutable_cf_options_.compaction_filter != nullptr ||
+ immutable_cf_options_.compaction_filter_factory != nullptr)) {
+ // This is a manual compaction and we have a compaction filter that should
+ // be executed, so we cannot do a trivial move
+ return false;
+ }
+
+ // Used in universal compaction, where trivial move can be done if the
+ // input files are non-overlapping
+ if ((mutable_cf_options_.compaction_options_universal.allow_trivial_move) &&
+ (output_level_ != 0)) {
+ return is_trivial_move_;
+ }
+
+ if (!(start_level_ != output_level_ && num_input_levels() == 1 &&
+ input(0, 0)->fd.GetPathId() == output_path_id() &&
+ InputCompressionMatchesOutput())) {
+ return false;
+ }
+
+ // assert inputs_.size() == 1
+
+ for (const auto& file : inputs_.front().files) {
+ std::vector<FileMetaData*> file_grand_parents;
+ if (output_level_ + 1 >= number_levels_) {
+ continue;
+ }
+ input_vstorage_->GetOverlappingInputs(output_level_ + 1, &file->smallest,
+ &file->largest, &file_grand_parents);
+ const auto compaction_size =
+ file->fd.GetFileSize() + TotalFileSize(file_grand_parents);
+ if (compaction_size > max_compaction_bytes_) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+void Compaction::AddInputDeletions(VersionEdit* out_edit) {
+ for (size_t which = 0; which < num_input_levels(); which++) {
+ for (size_t i = 0; i < inputs_[which].size(); i++) {
+ out_edit->DeleteFile(level(which), inputs_[which][i]->fd.GetNumber());
+ }
+ }
+}
+
+bool Compaction::KeyNotExistsBeyondOutputLevel(
+ const Slice& user_key, std::vector<size_t>* level_ptrs) const {
+ assert(input_version_ != nullptr);
+ assert(level_ptrs != nullptr);
+ assert(level_ptrs->size() == static_cast<size_t>(number_levels_));
+ if (bottommost_level_) {
+ return true;
+ } else if (output_level_ != 0 &&
+ cfd_->ioptions()->compaction_style == kCompactionStyleLevel) {
+ // Maybe use binary search to find right entry instead of linear search?
+ const Comparator* user_cmp = cfd_->user_comparator();
+ for (int lvl = output_level_ + 1; lvl < number_levels_; lvl++) {
+ const std::vector<FileMetaData*>& files =
+ input_vstorage_->LevelFiles(lvl);
+ for (; level_ptrs->at(lvl) < files.size(); level_ptrs->at(lvl)++) {
+ auto* f = files[level_ptrs->at(lvl)];
+ if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) {
+ // We've advanced far enough
+ if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) {
+ // Key falls in this file's range, so it may
+ // exist beyond output level
+ return false;
+ }
+ break;
+ }
+ }
+ }
+ return true;
+ }
+ return false;
+}
+
+// Mark (or clear) each file that is being compacted
+void Compaction::MarkFilesBeingCompacted(bool mark_as_compacted) {
+ for (size_t i = 0; i < num_input_levels(); i++) {
+ for (size_t j = 0; j < inputs_[i].size(); j++) {
+ assert(mark_as_compacted ? !inputs_[i][j]->being_compacted
+ : inputs_[i][j]->being_compacted);
+ inputs_[i][j]->being_compacted = mark_as_compacted;
+ }
+ }
+}
+
+// Sample output:
+// If compacting 3 L0 files, 2 L3 files and 1 L4 file, and outputting to L5,
+// print: "3@0 + 2@3 + 1@4 files to L5"
+const char* Compaction::InputLevelSummary(
+ InputLevelSummaryBuffer* scratch) const {
+ int len = 0;
+ bool is_first = true;
+ for (auto& input_level : inputs_) {
+ if (input_level.empty()) {
+ continue;
+ }
+ if (!is_first) {
+ len +=
+ snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, " + ");
+ len = std::min(len, static_cast<int>(sizeof(scratch->buffer)));
+ } else {
+ is_first = false;
+ }
+ len += snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len,
+ "%" ROCKSDB_PRIszt "@%d", input_level.size(),
+ input_level.level);
+ len = std::min(len, static_cast<int>(sizeof(scratch->buffer)));
+ }
+ snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len,
+ " files to L%d", output_level());
+
+ return scratch->buffer;
+}
+
+uint64_t Compaction::CalculateTotalInputSize() const {
+ uint64_t size = 0;
+ for (auto& input_level : inputs_) {
+ for (auto f : input_level.files) {
+ size += f->fd.GetFileSize();
+ }
+ }
+ return size;
+}
+
+void Compaction::ReleaseCompactionFiles(Status status) {
+ MarkFilesBeingCompacted(false);
+ cfd_->compaction_picker()->ReleaseCompactionFiles(this, status);
+}
+
+void Compaction::ResetNextCompactionIndex() {
+ assert(input_version_ != nullptr);
+ input_vstorage_->ResetNextCompactionIndex(start_level_);
+}
+
+namespace {
+int InputSummary(const std::vector<FileMetaData*>& files, char* output,
+ int len) {
+ *output = '\0';
+ int write = 0;
+ for (size_t i = 0; i < files.size(); i++) {
+ int sz = len - write;
+ int ret;
+ char sztxt[16];
+ AppendHumanBytes(files.at(i)->fd.GetFileSize(), sztxt, 16);
+ ret = snprintf(output + write, sz, "%" PRIu64 "(%s) ",
+ files.at(i)->fd.GetNumber(), sztxt);
+ if (ret < 0 || ret >= sz) break;
+ write += ret;
+ }
+ // if files.size() is non-zero, overwrite the last space
+ return write - !!files.size();
+}
+} // namespace
+
+void Compaction::Summary(char* output, int len) {
+ int write =
+ snprintf(output, len, "Base version %" PRIu64 " Base level %d, inputs: [",
+ input_version_->GetVersionNumber(), start_level_);
+ if (write < 0 || write >= len) {
+ return;
+ }
+
+ for (size_t level_iter = 0; level_iter < num_input_levels(); ++level_iter) {
+ if (level_iter > 0) {
+ write += snprintf(output + write, len - write, "], [");
+ if (write < 0 || write >= len) {
+ return;
+ }
+ }
+ write +=
+ InputSummary(inputs_[level_iter].files, output + write, len - write);
+ if (write < 0 || write >= len) {
+ return;
+ }
+ }
+
+ snprintf(output + write, len - write, "]");
+}
+
+uint64_t Compaction::OutputFilePreallocationSize() const {
+ uint64_t preallocation_size = 0;
+
+ for (const auto& level_files : inputs_) {
+ for (const auto& file : level_files.files) {
+ preallocation_size += file->fd.GetFileSize();
+ }
+ }
+
+ if (max_output_file_size_ != port::kMaxUint64 &&
+ (immutable_cf_options_.compaction_style == kCompactionStyleLevel ||
+ output_level() > 0)) {
+ preallocation_size = std::min(max_output_file_size_, preallocation_size);
+ }
+
+ // Over-estimate slightly so we don't end up just barely crossing
+ // the threshold
+ // No point to preallocate more than 1GB.
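+ // For example (illustrative numbers): an estimated output of 500 MB would
+ // preallocate roughly 550 MB, while larger estimates are clamped to the
+ // 1 GB cap below.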
+ return std::min(uint64_t{1073741824},
+ preallocation_size + (preallocation_size / 10));
+}
+
+std::unique_ptr<CompactionFilter> Compaction::CreateCompactionFilter() const {
+ if (!cfd_->ioptions()->compaction_filter_factory) {
+ return nullptr;
+ }
+
+ CompactionFilter::Context context;
+ context.is_full_compaction = is_full_compaction_;
+ context.is_manual_compaction = is_manual_compaction_;
+ context.column_family_id = cfd_->GetID();
+ return cfd_->ioptions()->compaction_filter_factory->CreateCompactionFilter(
+ context);
+}
+
+bool Compaction::IsOutputLevelEmpty() const {
+ return inputs_.back().level != output_level_ || inputs_.back().empty();
+}
+
+bool Compaction::ShouldFormSubcompactions() const {
+ if (max_subcompactions_ <= 1 || cfd_ == nullptr) {
+ return false;
+ }
+ if (cfd_->ioptions()->compaction_style == kCompactionStyleLevel) {
+ return (start_level_ == 0 || is_manual_compaction_) && output_level_ > 0 &&
+ !IsOutputLevelEmpty();
+ } else if (cfd_->ioptions()->compaction_style == kCompactionStyleUniversal) {
+ return number_levels_ > 1 && output_level_ > 0;
+ } else {
+ return false;
+ }
+}
+
+uint64_t Compaction::MinInputFileOldestAncesterTime() const {
+ uint64_t min_oldest_ancester_time = port::kMaxUint64;
+ for (const auto& level_files : inputs_) {
+ for (const auto& file : level_files.files) {
+ uint64_t oldest_ancester_time = file->TryGetOldestAncesterTime();
+ if (oldest_ancester_time != 0) {
+ min_oldest_ancester_time =
+ std::min(min_oldest_ancester_time, oldest_ancester_time);
+ }
+ }
+ }
+ return min_oldest_ancester_time;
+}
+
+int Compaction::GetInputBaseLevel() const {
+ return input_vstorage_->base_level();
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction.h b/src/rocksdb/db/compaction/compaction.h
new file mode 100644
index 000000000..9358e50ff
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction.h
@@ -0,0 +1,384 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include "db/version_set.h"
+#include "memory/arena.h"
+#include "options/cf_options.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+// This file contains the Compaction class, as well as some helper functions
+// and data structures used by the class.
+
+// Utility for comparing sstable boundary keys. Returns -1 if either a or b is
+// null which provides the property that a==null indicates a key that is less
+// than any key and b==null indicates a key that is greater than any key. Note
+// that the comparison is performed primarily on the user-key portion of the
+// key. If the user-keys compare equal, an additional test is made to sort
+// range tombstone sentinel keys before other keys with the same user-key. The
+// result is that 2 user-keys will compare equal if they differ purely on
+// their sequence number and value, but the range tombstone sentinel for that
+// user-key will compare not equal. This is necessary because the range
+// tombstone sentinel key is set as the largest key for an sstable even though
+// that key never appears in the database. We don't want adjacent sstables to
+// be considered overlapping if they are separated by the range tombstone
+// sentinel.
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
+ const InternalKey& b);
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey* a,
+ const InternalKey& b);
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
+ const InternalKey* b);
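+//
+// Illustrative sketch (hypothetical keys, assuming the default bytewise user
+// comparator): an sstable whose largest key is the range tombstone sentinel
+// for user key "k" compares less than a neighbouring sstable whose smallest
+// key is "k" with a real sequence number, so the two files are not
+// considered overlapping:
+//
+//   InternalKey sentinel("k", kMaxSequenceNumber, kTypeRangeDeletion);
+//   InternalKey point("k", /*sequence=*/100, kTypeValue);
+//   assert(sstableKeyCompare(BytewiseComparator(), sentinel, point) < 0);
+//   assert(sstableKeyCompare(BytewiseComparator(), point, sentinel) > 0);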
+
+// An AtomicCompactionUnitBoundary represents a range of keys [smallest,
+// largest] that exactly spans one or more neighbouring SSTs on the same
+// level. Every pair of SSTs in this range "overlap" (i.e., the largest
+// user key of one file is the smallest user key of the next file). These
+// boundaries are propagated down to RangeDelAggregator during compaction
+// to provide safe truncation boundaries for range tombstones.
+struct AtomicCompactionUnitBoundary {
+ const InternalKey* smallest = nullptr;
+ const InternalKey* largest = nullptr;
+};
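+//
+// Sketch of how a unit forms (hypothetical files): if level-N files
+// f1 = [a .. k] and f2 = [k .. z] share the boundary user key "k", and f1's
+// largest key is not the range tombstone sentinel for "k", the two files
+// form one atomic compaction unit whose boundary is
+// {&f1->smallest, &f2->largest}; a following file whose smallest user key
+// differs from f2's largest then starts a new unit.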
+
+// The structure that manages compaction input files associated
+// with the same physical level.
+struct CompactionInputFiles {
+ int level;
+ std::vector<FileMetaData*> files;
+ std::vector<AtomicCompactionUnitBoundary> atomic_compaction_unit_boundaries;
+ inline bool empty() const { return files.empty(); }
+ inline size_t size() const { return files.size(); }
+ inline void clear() { files.clear(); }
+ inline FileMetaData* operator[](size_t i) const { return files[i]; }
+};
+
+class Version;
+class ColumnFamilyData;
+class VersionStorageInfo;
+class CompactionFilter;
+
+// A Compaction encapsulates metadata about a compaction.
+class Compaction {
+ public:
+ Compaction(VersionStorageInfo* input_version,
+ const ImmutableCFOptions& immutable_cf_options,
+ const MutableCFOptions& mutable_cf_options,
+ std::vector<CompactionInputFiles> inputs, int output_level,
+ uint64_t target_file_size, uint64_t max_compaction_bytes,
+ uint32_t output_path_id, CompressionType compression,
+ CompressionOptions compression_opts, uint32_t max_subcompactions,
+ std::vector<FileMetaData*> grandparents,
+ bool manual_compaction = false, double score = -1,
+ bool deletion_compaction = false,
+ CompactionReason compaction_reason = CompactionReason::kUnknown);
+
+ // No copying allowed
+ Compaction(const Compaction&) = delete;
+ void operator=(const Compaction&) = delete;
+
+ ~Compaction();
+
+ // Returns the level associated with the specified compaction input level.
+ // If compaction_input_level is not specified, then input_level is set to 0.
+ int level(size_t compaction_input_level = 0) const {
+ return inputs_[compaction_input_level].level;
+ }
+
+ int start_level() const { return start_level_; }
+
+ // Outputs will go to this level
+ int output_level() const { return output_level_; }
+
+ // Returns the number of input levels in this compaction.
+ size_t num_input_levels() const { return inputs_.size(); }
+
+ // Return the object that holds the edits to the descriptor done
+ // by this compaction.
+ VersionEdit* edit() { return &edit_; }
+
+ // Returns the number of input files associated with the specified
+ // compaction input level.
+ // The function returns 0 when "compaction_input_level" < 0
+ // or "compaction_input_level" >= "num_input_levels()".
+ size_t num_input_files(size_t compaction_input_level) const {
+ if (compaction_input_level < inputs_.size()) {
+ return inputs_[compaction_input_level].size();
+ }
+ return 0;
+ }
+
+ // Returns input version of the compaction
+ Version* input_version() const { return input_version_; }
+
+ // Returns the ColumnFamilyData associated with the compaction.
+ ColumnFamilyData* column_family_data() const { return cfd_; }
+
+ // Returns the file meta data of the 'i'th input file at the
+ // specified compaction input level.
+ // REQUIREMENT: "compaction_input_level" must be >= 0 and
+ // < "input_levels()"
+ FileMetaData* input(size_t compaction_input_level, size_t i) const {
+ assert(compaction_input_level < inputs_.size());
+ return inputs_[compaction_input_level][i];
+ }
+
+ const std::vector<AtomicCompactionUnitBoundary>* boundaries(
+ size_t compaction_input_level) const {
+ assert(compaction_input_level < inputs_.size());
+ return &inputs_[compaction_input_level].atomic_compaction_unit_boundaries;
+ }
+
+ // Returns the list of file meta data of the specified compaction
+ // input level.
+ // REQUIREMENT: "compaction_input_level" must be >= 0 and
+ // < "input_levels()"
+ const std::vector<FileMetaData*>* inputs(
+ size_t compaction_input_level) const {
+ assert(compaction_input_level < inputs_.size());
+ return &inputs_[compaction_input_level].files;
+ }
+
+ const std::vector<CompactionInputFiles>* inputs() { return &inputs_; }
+
+ // Returns the LevelFilesBrief of the specified compaction input level.
+ const LevelFilesBrief* input_levels(size_t compaction_input_level) const {
+ return &input_levels_[compaction_input_level];
+ }
+
+ // Maximum size of files to build during this compaction.
+ uint64_t max_output_file_size() const { return max_output_file_size_; }
+
+ // What compression for output
+ CompressionType output_compression() const { return output_compression_; }
+
+ // What compression options for output
+ CompressionOptions output_compression_opts() const {
+ return output_compression_opts_;
+ }
+
+ // Whether the output file needs to be written to a second DB path.
+ uint32_t output_path_id() const { return output_path_id_; }
+
+ // Is this a trivial compaction that can be implemented by just
+ // moving a single input file to the next level (no merging or splitting)
+ bool IsTrivialMove() const;
+
+ // If true, then the compaction can be done by simply deleting input files.
+ bool deletion_compaction() const { return deletion_compaction_; }
+
+ // Add all inputs to this compaction as delete operations to *edit.
+ void AddInputDeletions(VersionEdit* edit);
+
+ // Returns true if the available information we have guarantees that
+ // the input "user_key" does not exist in any level beyond "output_level()".
+ bool KeyNotExistsBeyondOutputLevel(const Slice& user_key,
+ std::vector<size_t>* level_ptrs) const;
+
+ // Clear all files to indicate that they are not being compacted
+ // Delete this compaction from the list of running compactions.
+ //
+ // Requirement: DB mutex held
+ void ReleaseCompactionFiles(Status status);
+
+ // Returns the summary of the compaction in "output" with maximum "len"
+ // in bytes. The caller is responsible for the memory management of
+ // "output".
+ void Summary(char* output, int len);
+
+ // Return the score that was used to pick this compaction run.
+ double score() const { return score_; }
+
+ // Is this compaction creating a file in the bottommost level?
+ bool bottommost_level() const { return bottommost_level_; }
+
+ // Does this compaction include all sst files?
+ bool is_full_compaction() const { return is_full_compaction_; }
+
+ // Was this compaction triggered manually by the client?
+ bool is_manual_compaction() const { return is_manual_compaction_; }
+
+ // Used when the allow_trivial_move option is set in
+ // universal compaction. If all the input files are
+ // non-overlapping, then is_trivial_move_ will be
+ // set to true; otherwise it is false.
+ void set_is_trivial_move(bool trivial_move) {
+ is_trivial_move_ = trivial_move;
+ }
+
+ // Used when allow_trivial_move option is set in
+ // Universal compaction. Returns true, if the input files
+ // are non-overlapping and can be trivially moved.
+ bool is_trivial_move() const { return is_trivial_move_; }
+
+ // How many total levels are there?
+ int number_levels() const { return number_levels_; }
+
+ // Return the ImmutableCFOptions that should be used throughout the compaction
+ // procedure
+ const ImmutableCFOptions* immutable_cf_options() const {
+ return &immutable_cf_options_;
+ }
+
+ // Return the MutableCFOptions that should be used throughout the compaction
+ // procedure
+ const MutableCFOptions* mutable_cf_options() const {
+ return &mutable_cf_options_;
+ }
+
+ // Returns the size in bytes that the output file should be preallocated to.
+ // In level compaction, that is max_output_file_size_. In universal compaction,
+ // is the sum of all input file sizes.
+ uint64_t OutputFilePreallocationSize() const;
+
+ void SetInputVersion(Version* input_version);
+
+ struct InputLevelSummaryBuffer {
+ char buffer[128];
+ };
+
+ const char* InputLevelSummary(InputLevelSummaryBuffer* scratch) const;
+
+ uint64_t CalculateTotalInputSize() const;
+
+ // In case of compaction error, reset the nextIndex that is used
+ // to pick up the next file to be compacted from files_by_size_
+ void ResetNextCompactionIndex();
+
+ // Create a CompactionFilter from compaction_filter_factory
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter() const;
+
+ // Is the input level corresponding to output_level_ empty?
+ bool IsOutputLevelEmpty() const;
+
+ // Should this compaction be broken up into smaller ones run in parallel?
+ bool ShouldFormSubcompactions() const;
+
+ // test function to validate the functionality of IsBottommostLevel()
+ // function -- determines if compaction with inputs and storage is bottommost
+ static bool TEST_IsBottommostLevel(
+ int output_level, VersionStorageInfo* vstorage,
+ const std::vector<CompactionInputFiles>& inputs);
+
+ TablePropertiesCollection GetOutputTableProperties() const {
+ return output_table_properties_;
+ }
+
+ void SetOutputTableProperties(TablePropertiesCollection tp) {
+ output_table_properties_ = std::move(tp);
+ }
+
+ Slice GetSmallestUserKey() const { return smallest_user_key_; }
+
+ Slice GetLargestUserKey() const { return largest_user_key_; }
+
+ int GetInputBaseLevel() const;
+
+ CompactionReason compaction_reason() { return compaction_reason_; }
+
+ const std::vector<FileMetaData*>& grandparents() const {
+ return grandparents_;
+ }
+
+ uint64_t max_compaction_bytes() const { return max_compaction_bytes_; }
+
+ uint32_t max_subcompactions() const { return max_subcompactions_; }
+
+ uint64_t MinInputFileOldestAncesterTime() const;
+
+ private:
+ // mark (or clear) all files that are being compacted
+ void MarkFilesBeingCompacted(bool mark_as_compacted);
+
+ // get the smallest and largest key present in files to be compacted
+ static void GetBoundaryKeys(VersionStorageInfo* vstorage,
+ const std::vector<CompactionInputFiles>& inputs,
+ Slice* smallest_key, Slice* largest_key);
+
+ // Get the atomic file boundaries for all files in the compaction. Necessary
+ // in order to avoid the scenario described in
+ // https://github.com/facebook/rocksdb/pull/4432#discussion_r221072219 and plumb
+ // down appropriate key boundaries to RangeDelAggregator during compaction.
+ static std::vector<CompactionInputFiles> PopulateWithAtomicBoundaries(
+ VersionStorageInfo* vstorage, std::vector<CompactionInputFiles> inputs);
+
+ // helper function to determine if compaction with inputs and storage is
+ // bottommost
+ static bool IsBottommostLevel(
+ int output_level, VersionStorageInfo* vstorage,
+ const std::vector<CompactionInputFiles>& inputs);
+
+ static bool IsFullCompaction(VersionStorageInfo* vstorage,
+ const std::vector<CompactionInputFiles>& inputs);
+
+ VersionStorageInfo* input_vstorage_;
+
+ const int start_level_; // the lowest level to be compacted
+ const int output_level_; // level to which output files are stored
+ uint64_t max_output_file_size_;
+ uint64_t max_compaction_bytes_;
+ uint32_t max_subcompactions_;
+ const ImmutableCFOptions immutable_cf_options_;
+ const MutableCFOptions mutable_cf_options_;
+ Version* input_version_;
+ VersionEdit edit_;
+ const int number_levels_;
+ ColumnFamilyData* cfd_;
+ Arena arena_; // Arena used to allocate space for file_levels_
+
+ const uint32_t output_path_id_;
+ CompressionType output_compression_;
+ CompressionOptions output_compression_opts_;
+ // If true, then the compaction can be done by simply deleting input files.
+ const bool deletion_compaction_;
+
+ // Compaction input files organized by level. Constant after construction
+ const std::vector<CompactionInputFiles> inputs_;
+
+ // A copy of inputs_, organized more closely in memory
+ autovector<LevelFilesBrief, 2> input_levels_;
+
+ // State used to check for number of overlapping grandparent files
+ // (grandparent == "output_level_ + 1")
+ std::vector<FileMetaData*> grandparents_;
+ const double score_; // score that was used to pick this compaction.
+
+ // Is this compaction creating a file in the bottommost level?
+ const bool bottommost_level_;
+ // Does this compaction include all sst files?
+ const bool is_full_compaction_;
+
+ // Is this compaction requested by the client?
+ const bool is_manual_compaction_;
+
+ // True if we can do trivial move in Universal multi level
+ // compaction
+ bool is_trivial_move_;
+
+ // Does input compression match the output compression?
+ bool InputCompressionMatchesOutput() const;
+
+ // table properties of output files
+ TablePropertiesCollection output_table_properties_;
+
+ // smallest user keys in compaction
+ Slice smallest_user_key_;
+
+ // largest user keys in compaction
+ Slice largest_user_key_;
+
+ // Reason for compaction
+ CompactionReason compaction_reason_;
+};
+
+// Return sum of sizes of all files in `files`.
+extern uint64_t TotalFileSize(const std::vector<FileMetaData*>& files);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_iteration_stats.h b/src/rocksdb/db/compaction/compaction_iteration_stats.h
new file mode 100644
index 000000000..963c1d8eb
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_iteration_stats.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2016-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/rocksdb_namespace.h"
+
+struct CompactionIterationStats {
+ // Compaction statistics
+
+ // Doesn't include records skipped because of
+ // CompactionFilter::Decision::kRemoveAndSkipUntil.
+ int64_t num_record_drop_user = 0;
+
+ int64_t num_record_drop_hidden = 0;
+ int64_t num_record_drop_obsolete = 0;
+ int64_t num_record_drop_range_del = 0;
+ int64_t num_range_del_drop_obsolete = 0;
+ // Deletions obsoleted before bottom level due to file gap optimization.
+ int64_t num_optimized_del_drop_obsolete = 0;
+ uint64_t total_filter_time = 0;
+
+ // Input statistics
+ // TODO(noetzli): The stats are incomplete. They are lacking everything
+ // consumed by MergeHelper.
+ uint64_t num_input_records = 0;
+ uint64_t num_input_deletion_records = 0;
+ uint64_t num_input_corrupt_records = 0;
+ uint64_t total_input_raw_key_bytes = 0;
+ uint64_t total_input_raw_value_bytes = 0;
+
+ // Single-Delete diagnostics for exceptional situations
+ uint64_t num_single_del_fallthru = 0;
+ uint64_t num_single_del_mismatch = 0;
+};
diff --git a/src/rocksdb/db/compaction/compaction_iterator.cc b/src/rocksdb/db/compaction/compaction_iterator.cc
new file mode 100644
index 000000000..1bebfc717
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_iterator.cc
@@ -0,0 +1,774 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <cinttypes>
+
+#include "db/compaction/compaction_iterator.h"
+#include "db/snapshot_checker.h"
+#include "port/likely.h"
+#include "rocksdb/listener.h"
+#include "table/internal_iterator.h"
+#include "test_util/sync_point.h"
+
+#define DEFINITELY_IN_SNAPSHOT(seq, snapshot) \
+ ((seq) <= (snapshot) && \
+ (snapshot_checker_ == nullptr || \
+ LIKELY(snapshot_checker_->CheckInSnapshot((seq), (snapshot)) == \
+ SnapshotCheckerResult::kInSnapshot)))
+
+#define DEFINITELY_NOT_IN_SNAPSHOT(seq, snapshot) \
+ ((seq) > (snapshot) || \
+ (snapshot_checker_ != nullptr && \
+ UNLIKELY(snapshot_checker_->CheckInSnapshot((seq), (snapshot)) == \
+ SnapshotCheckerResult::kNotInSnapshot)))
+
+#define IN_EARLIEST_SNAPSHOT(seq) \
+ ((seq) <= earliest_snapshot_ && \
+ (snapshot_checker_ == nullptr || LIKELY(IsInEarliestSnapshot(seq))))
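+
+// Illustrative reading of the macros above (a sketch): with no
+// snapshot_checker_, visibility is decided purely by sequence numbers, so
+// DEFINITELY_IN_SNAPSHOT(5, 10) is true and DEFINITELY_NOT_IN_SNAPSHOT(15, 10)
+// is true. When a snapshot_checker_ is present (e.g. for write-prepared
+// transactions), a sequence number <= snapshot may still be undecided or not
+// in the snapshot, which is why the two macros are not simple negations of
+// each other.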
+
+namespace ROCKSDB_NAMESPACE {
+
+CompactionIterator::CompactionIterator(
+ InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
+ SequenceNumber last_sequence, std::vector<SequenceNumber>* snapshots,
+ SequenceNumber earliest_write_conflict_snapshot,
+ const SnapshotChecker* snapshot_checker, Env* env,
+ bool report_detailed_time, bool expect_valid_internal_key,
+ CompactionRangeDelAggregator* range_del_agg, const Compaction* compaction,
+ const CompactionFilter* compaction_filter,
+ const std::atomic<bool>* shutting_down,
+ const SequenceNumber preserve_deletes_seqnum,
+ const std::atomic<bool>* manual_compaction_paused,
+ const std::shared_ptr<Logger> info_log)
+ : CompactionIterator(
+ input, cmp, merge_helper, last_sequence, snapshots,
+ earliest_write_conflict_snapshot, snapshot_checker, env,
+ report_detailed_time, expect_valid_internal_key, range_del_agg,
+ std::unique_ptr<CompactionProxy>(
+ compaction ? new CompactionProxy(compaction) : nullptr),
+ compaction_filter, shutting_down, preserve_deletes_seqnum,
+ manual_compaction_paused, info_log) {}
+
+CompactionIterator::CompactionIterator(
+ InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
+ SequenceNumber /*last_sequence*/, std::vector<SequenceNumber>* snapshots,
+ SequenceNumber earliest_write_conflict_snapshot,
+ const SnapshotChecker* snapshot_checker, Env* env,
+ bool report_detailed_time, bool expect_valid_internal_key,
+ CompactionRangeDelAggregator* range_del_agg,
+ std::unique_ptr<CompactionProxy> compaction,
+ const CompactionFilter* compaction_filter,
+ const std::atomic<bool>* shutting_down,
+ const SequenceNumber preserve_deletes_seqnum,
+ const std::atomic<bool>* manual_compaction_paused,
+ const std::shared_ptr<Logger> info_log)
+ : input_(input),
+ cmp_(cmp),
+ merge_helper_(merge_helper),
+ snapshots_(snapshots),
+ earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot),
+ snapshot_checker_(snapshot_checker),
+ env_(env),
+ report_detailed_time_(report_detailed_time),
+ expect_valid_internal_key_(expect_valid_internal_key),
+ range_del_agg_(range_del_agg),
+ compaction_(std::move(compaction)),
+ compaction_filter_(compaction_filter),
+ shutting_down_(shutting_down),
+ manual_compaction_paused_(manual_compaction_paused),
+ preserve_deletes_seqnum_(preserve_deletes_seqnum),
+ current_user_key_sequence_(0),
+ current_user_key_snapshot_(0),
+ merge_out_iter_(merge_helper_),
+ current_key_committed_(false),
+ info_log_(info_log) {
+ assert(compaction_filter_ == nullptr || compaction_ != nullptr);
+ assert(snapshots_ != nullptr);
+ bottommost_level_ =
+ compaction_ == nullptr ? false : compaction_->bottommost_level();
+ if (compaction_ != nullptr) {
+ level_ptrs_ = std::vector<size_t>(compaction_->number_levels(), 0);
+ }
+ if (snapshots_->size() == 0) {
+ // optimize for fast path if there are no snapshots
+ visible_at_tip_ = true;
+ earliest_snapshot_iter_ = snapshots_->end();
+ earliest_snapshot_ = kMaxSequenceNumber;
+ latest_snapshot_ = 0;
+ } else {
+ visible_at_tip_ = false;
+ earliest_snapshot_iter_ = snapshots_->begin();
+ earliest_snapshot_ = snapshots_->at(0);
+ latest_snapshot_ = snapshots_->back();
+ }
+#ifndef NDEBUG
+ // findEarliestVisibleSnapshot assumes this ordering.
+ for (size_t i = 1; i < snapshots_->size(); ++i) {
+ assert(snapshots_->at(i - 1) < snapshots_->at(i));
+ }
+#endif
+ input_->SetPinnedItersMgr(&pinned_iters_mgr_);
+ TEST_SYNC_POINT_CALLBACK("CompactionIterator:AfterInit", compaction_.get());
+}
+
+CompactionIterator::~CompactionIterator() {
+ // input_ iterator lifetime is longer than pinned_iters_mgr_ lifetime
+ input_->SetPinnedItersMgr(nullptr);
+}
+
+void CompactionIterator::ResetRecordCounts() {
+ iter_stats_.num_record_drop_user = 0;
+ iter_stats_.num_record_drop_hidden = 0;
+ iter_stats_.num_record_drop_obsolete = 0;
+ iter_stats_.num_record_drop_range_del = 0;
+ iter_stats_.num_range_del_drop_obsolete = 0;
+ iter_stats_.num_optimized_del_drop_obsolete = 0;
+}
+
+void CompactionIterator::SeekToFirst() {
+ NextFromInput();
+ PrepareOutput();
+}
+
+void CompactionIterator::Next() {
+ // If there is a merge output, return it before continuing to process the
+ // input.
+ if (merge_out_iter_.Valid()) {
+ merge_out_iter_.Next();
+
+ // Check if we returned all records of the merge output.
+ if (merge_out_iter_.Valid()) {
+ key_ = merge_out_iter_.key();
+ value_ = merge_out_iter_.value();
+ bool valid_key __attribute__((__unused__));
+ valid_key = ParseInternalKey(key_, &ikey_);
+ // MergeUntil stops when it encounters a corrupt key and does not
+ // include it in the result, so we expect the keys here to be valid.
+ assert(valid_key);
+ if (!valid_key) {
+ ROCKS_LOG_FATAL(info_log_, "Invalid key (%s) in compaction",
+ key_.ToString(true).c_str());
+ }
+
+ // Keep current_key_ in sync.
+ current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+ key_ = current_key_.GetInternalKey();
+ ikey_.user_key = current_key_.GetUserKey();
+ valid_ = true;
+ } else {
+ // We consumed all pinned merge operands, release pinned iterators
+ pinned_iters_mgr_.ReleasePinnedData();
+ // MergeHelper moves the iterator to the first record after the merged
+ // records, so even though we reached the end of the merge output, we do
+ // not want to advance the iterator.
+ NextFromInput();
+ }
+ } else {
+ // Only advance the input iterator if there is no merge output and the
+ // iterator is not already at the next record.
+ if (!at_next_) {
+ input_->Next();
+ }
+ NextFromInput();
+ }
+
+ if (valid_) {
+ // Record that we've outputted a record for the current key.
+ has_outputted_key_ = true;
+ }
+
+ PrepareOutput();
+}
+
+void CompactionIterator::InvokeFilterIfNeeded(bool* need_skip,
+ Slice* skip_until) {
+ if (compaction_filter_ != nullptr &&
+ (ikey_.type == kTypeValue || ikey_.type == kTypeBlobIndex)) {
+ // If the user has specified a compaction filter and the sequence
+ // number is greater than any external snapshot, then invoke the
+ // filter. If the return value of the compaction filter is true,
+ // replace the entry with a deletion marker.
+ CompactionFilter::Decision filter;
+ compaction_filter_value_.clear();
+ compaction_filter_skip_until_.Clear();
+ CompactionFilter::ValueType value_type =
+ ikey_.type == kTypeValue ? CompactionFilter::ValueType::kValue
+ : CompactionFilter::ValueType::kBlobIndex;
+ // Hack: pass internal key to BlobIndexCompactionFilter since it needs
+ // to get sequence number.
+ Slice& filter_key = ikey_.type == kTypeValue ? ikey_.user_key : key_;
+ {
+ StopWatchNano timer(env_, report_detailed_time_);
+ filter = compaction_filter_->FilterV2(
+ compaction_->level(), filter_key, value_type, value_,
+ &compaction_filter_value_, compaction_filter_skip_until_.rep());
+ iter_stats_.total_filter_time +=
+ env_ != nullptr && report_detailed_time_ ? timer.ElapsedNanos() : 0;
+ }
+
+ if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil &&
+ cmp_->Compare(*compaction_filter_skip_until_.rep(), ikey_.user_key) <=
+ 0) {
+ // Can't skip to a key smaller than the current one.
+ // Keep the key as per FilterV2 documentation.
+ filter = CompactionFilter::Decision::kKeep;
+ }
+
+ if (filter == CompactionFilter::Decision::kRemove) {
+ // convert the current key to a delete; key_ is pointing into
+ // current_key_ at this point, so updating current_key_ updates key()
+ ikey_.type = kTypeDeletion;
+ current_key_.UpdateInternalKey(ikey_.sequence, kTypeDeletion);
+ // no value associated with delete
+ value_.clear();
+ iter_stats_.num_record_drop_user++;
+ } else if (filter == CompactionFilter::Decision::kChangeValue) {
+ value_ = compaction_filter_value_;
+ } else if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil) {
+ *need_skip = true;
+ compaction_filter_skip_until_.ConvertFromUserKey(kMaxSequenceNumber,
+ kValueTypeForSeek);
+ *skip_until = compaction_filter_skip_until_.Encode();
+ }
+ }
+}
+
+void CompactionIterator::NextFromInput() {
+ at_next_ = false;
+ valid_ = false;
+
+ while (!valid_ && input_->Valid() && !IsPausingManualCompaction() &&
+ !IsShuttingDown()) {
+ key_ = input_->key();
+ value_ = input_->value();
+ iter_stats_.num_input_records++;
+
+ if (!ParseInternalKey(key_, &ikey_)) {
+ // If `expect_valid_internal_key_` is false, return the corrupted key
+ // and let the caller decide what to do with it.
+ // TODO(noetzli): We should have a more elegant solution for this.
+ if (expect_valid_internal_key_) {
+ assert(!"Corrupted internal key not expected.");
+ status_ = Status::Corruption("Corrupted internal key not expected.");
+ break;
+ }
+ key_ = current_key_.SetInternalKey(key_);
+ has_current_user_key_ = false;
+ current_user_key_sequence_ = kMaxSequenceNumber;
+ current_user_key_snapshot_ = 0;
+ iter_stats_.num_input_corrupt_records++;
+ valid_ = true;
+ break;
+ }
+ TEST_SYNC_POINT_CALLBACK("CompactionIterator:ProcessKV", &ikey_);
+
+ // Update input statistics
+ if (ikey_.type == kTypeDeletion || ikey_.type == kTypeSingleDeletion) {
+ iter_stats_.num_input_deletion_records++;
+ }
+ iter_stats_.total_input_raw_key_bytes += key_.size();
+ iter_stats_.total_input_raw_value_bytes += value_.size();
+
+ // If need_skip is true, we should seek the input iterator
+ // to internal key skip_until and continue from there.
+ bool need_skip = false;
+ // Points either into compaction_filter_skip_until_ or into
+ // merge_helper_->compaction_filter_skip_until_.
+ Slice skip_until;
+
+ // Check whether the user key changed. After this if statement current_key_
+ // is a copy of the current input key (maybe converted to a delete by the
+ // compaction filter). ikey_.user_key is pointing to the copy.
+ if (!has_current_user_key_ ||
+ !cmp_->Equal(ikey_.user_key, current_user_key_)) {
+ // First occurrence of this user key
+ // Copy key for output
+ key_ = current_key_.SetInternalKey(key_, &ikey_);
+ current_user_key_ = ikey_.user_key;
+ has_current_user_key_ = true;
+ has_outputted_key_ = false;
+ current_user_key_sequence_ = kMaxSequenceNumber;
+ current_user_key_snapshot_ = 0;
+ current_key_committed_ = KeyCommitted(ikey_.sequence);
+
+ // Apply the compaction filter to the first committed version of the user
+ // key.
+ if (current_key_committed_) {
+ InvokeFilterIfNeeded(&need_skip, &skip_until);
+ }
+ } else {
+ // Update the current key to reflect the new sequence number/type without
+ // copying the user key.
+ // TODO(rven): Compaction filter does not process keys in this path
+ // Need to have the compaction filter process multiple versions
+ // if we have versions on both sides of a snapshot
+ current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+ key_ = current_key_.GetInternalKey();
+ ikey_.user_key = current_key_.GetUserKey();
+
+ // Note that a newer version of a key is ordered before older versions. If a
+ // newer version of a key is committed, so is the older version. No need
+ // to query snapshot_checker_ in that case.
+ if (UNLIKELY(!current_key_committed_)) {
+ assert(snapshot_checker_ != nullptr);
+ current_key_committed_ = KeyCommitted(ikey_.sequence);
+ // Apply the compaction filter to the first committed version of the
+ // user key.
+ if (current_key_committed_) {
+ InvokeFilterIfNeeded(&need_skip, &skip_until);
+ }
+ }
+ }
+
+ if (UNLIKELY(!current_key_committed_)) {
+ assert(snapshot_checker_ != nullptr);
+ valid_ = true;
+ break;
+ }
+
+ // If there are no snapshots, then this kv affects visibility at tip.
+ // Otherwise, search through all existing snapshots to find the earliest
+ // snapshot that is affected by this kv.
+ SequenceNumber last_sequence __attribute__((__unused__));
+ last_sequence = current_user_key_sequence_;
+ current_user_key_sequence_ = ikey_.sequence;
+ SequenceNumber last_snapshot = current_user_key_snapshot_;
+ SequenceNumber prev_snapshot = 0; // 0 means no previous snapshot
+ current_user_key_snapshot_ =
+ visible_at_tip_
+ ? earliest_snapshot_
+ : findEarliestVisibleSnapshot(ikey_.sequence, &prev_snapshot);
+
+ if (need_skip) {
+ // This case is handled below.
+ } else if (clear_and_output_next_key_) {
+ // In the previous iteration we encountered a single delete that we could
+ // not compact out. We will keep this Put, but can drop its data.
+ // (See Optimization 3, below.)
+ assert(ikey_.type == kTypeValue);
+ if (ikey_.type != kTypeValue) {
+ ROCKS_LOG_FATAL(info_log_,
+ "Unexpected key type %d for compaction output",
+ ikey_.type);
+ }
+ assert(current_user_key_snapshot_ == last_snapshot);
+ if (current_user_key_snapshot_ != last_snapshot) {
+ ROCKS_LOG_FATAL(info_log_,
+ "current_user_key_snapshot_ (%" PRIu64
+ ") != last_snapshot (%" PRIu64 ")",
+ current_user_key_snapshot_, last_snapshot);
+ }
+
+ value_.clear();
+ valid_ = true;
+ clear_and_output_next_key_ = false;
+ } else if (ikey_.type == kTypeSingleDeletion) {
+ // We can compact out a SingleDelete if:
+ // 1) We encounter the corresponding PUT -OR- we know that this key
+ // doesn't appear past this output level
+ // =AND=
+ // 2) We've already returned a record in this snapshot -OR-
+ // there is no earlier earliest_write_conflict_snapshot.
+ //
+ // Rule 1 is needed for SingleDelete correctness. Rule 2 is needed to
+ // allow Transactions to do write-conflict checking (if we compacted away
+ // all keys, then we wouldn't know that a write happened in this
+ // snapshot). If there is no earlier snapshot, then we know that there
+ // are no active transactions that need to know about any writes.
+ //
+ // Optimization 3:
+ // If we encounter a SingleDelete followed by a PUT and Rule 2 is NOT
+ // true, then we must output a SingleDelete. In this case, we will decide
+ // to also output the PUT. While we are compacting less by outputting the
+ // PUT now, hopefully this will lead to better compaction in the future
+ // when Rule 2 is later true (i.e., we are hoping we can later compact out
+ // both the SingleDelete and the Put, while we couldn't if we only
+ // outputted the SingleDelete now).
+ // In this case, we can save space by removing the PUT's value as it will
+ // never be read.
+ //
+ // Deletes and Merges are not supported on the same key that has a
+ // SingleDelete as it is not possible to correctly do any partial
+ // compaction of such a combination of operations. The result of mixing
+ // those operations for a given key is documented as being undefined. So
+ // we can choose how to handle such a combinations of operations. We will
+ // try to compact out as much as we can in these cases.
+ // We will report counts on these anomalous cases.
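+ //
+ // Hypothetical walk-through of the rules above: given adjacent input
+ // entries SD(k)@7 followed by PUT(k)@5 with no snapshot boundary between
+ // them: if no write-conflict snapshot is earlier than sequence 7, both
+ // records can be dropped (Rules 1 and 2 hold). If such an earlier snapshot
+ // exists, the SingleDelete is output and the following PUT is kept with
+ // its value cleared (Optimization 3).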
+
+ // The easiest way to process a SingleDelete during iteration is to peek
+ // ahead at the next key.
+ ParsedInternalKey next_ikey;
+ input_->Next();
+
+ // Check whether the next key exists, is not corrupt, and is the same key
+ // as the single delete.
+ if (input_->Valid() && ParseInternalKey(input_->key(), &next_ikey) &&
+ cmp_->Equal(ikey_.user_key, next_ikey.user_key)) {
+ // Check whether the next key belongs to the same snapshot as the
+ // SingleDelete.
+ if (prev_snapshot == 0 ||
+ DEFINITELY_NOT_IN_SNAPSHOT(next_ikey.sequence, prev_snapshot)) {
+ if (next_ikey.type == kTypeSingleDeletion) {
+ // We encountered two SingleDeletes in a row. This could be due to
+ // unexpected user input.
+ // Skip the first SingleDelete and let the next iteration decide how
+ // to handle the second SingleDelete
+
+ // First SingleDelete has been skipped since we already called
+ // input_->Next().
+ ++iter_stats_.num_record_drop_obsolete;
+ ++iter_stats_.num_single_del_mismatch;
+ } else if (has_outputted_key_ ||
+ DEFINITELY_IN_SNAPSHOT(
+ ikey_.sequence, earliest_write_conflict_snapshot_)) {
+ // Found a matching value, we can drop the single delete and the
+ // value. It is safe to drop both records since we've already
+ // outputted a key in this snapshot, or there is no earlier
+ // snapshot (Rule 2 above).
+
+ // Note: it doesn't matter whether the second key is a Put or if it
+ // is an unexpected Merge or Delete. We will compact it out
+ // either way. We will maintain counts of how many mismatches
+ // happened
+ if (next_ikey.type != kTypeValue &&
+ next_ikey.type != kTypeBlobIndex) {
+ ++iter_stats_.num_single_del_mismatch;
+ }
+
+ ++iter_stats_.num_record_drop_hidden;
+ ++iter_stats_.num_record_drop_obsolete;
+ // Already called input_->Next() once. Call it a second time to
+ // skip past the second key.
+ input_->Next();
+ } else {
+ // Found a matching value, but we cannot drop both keys since
+ // there is an earlier snapshot and we need to leave behind a record
+ // to know that a write happened in this snapshot (Rule 2 above).
+ // Clear the value and output the SingleDelete. (The value will be
+ // outputted on the next iteration.)
+
+ // Setting valid_ to true will output the current SingleDelete
+ valid_ = true;
+
+ // Set up the Put to be outputted in the next iteration.
+ // (Optimization 3).
+ clear_and_output_next_key_ = true;
+ }
+ } else {
+ // We hit the next snapshot without hitting a put, so the iterator
+ // returns the single delete.
+ valid_ = true;
+ }
+ } else {
+ // We are at the end of the input, could not parse the next key, or hit
+ // a different key. The iterator returns the single delete if the key
+ // possibly exists beyond the current output level. We set
+ // has_current_user_key to false so that if the iterator is at the next
+ // key, we do not compare it again against the previous key at the next
+ // iteration. If the next key is corrupt, we return before the
+ // comparison, so the value of has_current_user_key does not matter.
+ has_current_user_key_ = false;
+ if (compaction_ != nullptr && IN_EARLIEST_SNAPSHOT(ikey_.sequence) &&
+ compaction_->KeyNotExistsBeyondOutputLevel(ikey_.user_key,
+ &level_ptrs_)) {
+ // Key doesn't exist outside of this range.
+ // Can compact out this SingleDelete.
+ ++iter_stats_.num_record_drop_obsolete;
+ ++iter_stats_.num_single_del_fallthru;
+ if (!bottommost_level_) {
+ ++iter_stats_.num_optimized_del_drop_obsolete;
+ }
+ } else {
+ // Output SingleDelete
+ valid_ = true;
+ }
+ }
+
+ if (valid_) {
+ at_next_ = true;
+ }
+ } else if (last_snapshot == current_user_key_snapshot_ ||
+ (last_snapshot > 0 &&
+ last_snapshot < current_user_key_snapshot_)) {
+ // If the earliest snapshot in which this key is visible is
+ // the same as the visibility of a previous instance of the
+ // same key, then this kv is not visible in any snapshot.
+ // Hidden by a newer entry for the same user key
+ //
+ // Note: Dropping this key will not affect TransactionDB write-conflict
+ // checking since there has already been a record returned for this key
+ // in this snapshot.
+ assert(last_sequence >= current_user_key_sequence_);
+ if (last_sequence < current_user_key_sequence_) {
+ ROCKS_LOG_FATAL(info_log_,
+ "last_sequence (%" PRIu64
+ ") < current_user_key_sequence_ (%" PRIu64 ")",
+ last_sequence, current_user_key_sequence_);
+ }
+
+ ++iter_stats_.num_record_drop_hidden; // (A)
+ input_->Next();
+ } else if (compaction_ != nullptr && ikey_.type == kTypeDeletion &&
+ IN_EARLIEST_SNAPSHOT(ikey_.sequence) &&
+ ikeyNotNeededForIncrementalSnapshot() &&
+ compaction_->KeyNotExistsBeyondOutputLevel(ikey_.user_key,
+ &level_ptrs_)) {
+ // TODO(noetzli): This is the only place where we use compaction_
+ // (besides the constructor). We should probably get rid of this
+ // dependency and find a way to do similar filtering during flushes.
+ //
+ // For this user key:
+ // (1) there is no data in higher levels
+ // (2) data in lower levels will have larger sequence numbers
+ // (3) data in layers that are being compacted here and have
+ // smaller sequence numbers will be dropped in the next
+ // few iterations of this loop (by rule (A) above).
+ // Therefore this deletion marker is obsolete and can be dropped.
+ //
+ // Note: Dropping this Delete will not affect TransactionDB
+ // write-conflict checking since it is earlier than any snapshot.
+ //
+    // It seems that we can also drop a deletion later than the earliest
+    // snapshot given that:
+    // (1) The deletion is earlier than earliest_write_conflict_snapshot, and
+    // (2) No value exists earlier than the deletion.
+ ++iter_stats_.num_record_drop_obsolete;
+ if (!bottommost_level_) {
+ ++iter_stats_.num_optimized_del_drop_obsolete;
+ }
+ input_->Next();
+ } else if ((ikey_.type == kTypeDeletion) && bottommost_level_ &&
+ ikeyNotNeededForIncrementalSnapshot()) {
+    // Handle the case where we have a delete key at the bottommost level.
+    // We can skip outputting the key iff there are no subsequent puts for
+    // this key.
+ ParsedInternalKey next_ikey;
+ input_->Next();
+    // Skip over all versions of this key that happen to occur in the same
+    // snapshot range as the delete.
+ while (input_->Valid() && ParseInternalKey(input_->key(), &next_ikey) &&
+ cmp_->Equal(ikey_.user_key, next_ikey.user_key) &&
+ (prev_snapshot == 0 ||
+ DEFINITELY_NOT_IN_SNAPSHOT(next_ikey.sequence, prev_snapshot))) {
+ input_->Next();
+ }
+    // If there is still a record with this key after the skips, we need to
+    // output the delete too.
+ if (input_->Valid() && ParseInternalKey(input_->key(), &next_ikey) &&
+ cmp_->Equal(ikey_.user_key, next_ikey.user_key)) {
+ valid_ = true;
+ at_next_ = true;
+ }
+ } else if (ikey_.type == kTypeMerge) {
+ if (!merge_helper_->HasOperator()) {
+ status_ = Status::InvalidArgument(
+ "merge_operator is not properly initialized.");
+ return;
+ }
+
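+    // Pin the input blocks so that the merge operand slices collected by
+    // MergeUntil() below remain valid while input_ advances past them.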
+ pinned_iters_mgr_.StartPinning();
+    // We know the merge type entry is not hidden, otherwise we would
+    // have hit (A).
+    // We encapsulate the merge-related state machine in a different
+    // object to minimize change to the existing flow.
+ Status s = merge_helper_->MergeUntil(input_, range_del_agg_,
+ prev_snapshot, bottommost_level_);
+ merge_out_iter_.SeekToFirst();
+
+ if (!s.ok() && !s.IsMergeInProgress()) {
+ status_ = s;
+ return;
+ } else if (merge_out_iter_.Valid()) {
+ // NOTE: key, value, and ikey_ refer to old entries.
+ // These will be correctly set below.
+ key_ = merge_out_iter_.key();
+ value_ = merge_out_iter_.value();
+ bool valid_key __attribute__((__unused__));
+ valid_key = ParseInternalKey(key_, &ikey_);
+      // MergeUntil stops when it encounters a corrupt key and does not
+      // include it in the result, so we expect the keys here to be valid.
+ assert(valid_key);
+ if (!valid_key) {
+ ROCKS_LOG_FATAL(info_log_, "Invalid key (%s) in compaction",
+ key_.ToString(true).c_str());
+ }
+ // Keep current_key_ in sync.
+ current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+ key_ = current_key_.GetInternalKey();
+ ikey_.user_key = current_key_.GetUserKey();
+ valid_ = true;
+ } else {
+      // All merge operands were filtered out. Reset the user key, since the
+      // batch consumed by the merge operator should not shadow any keys
+      // coming after the merges.
+ has_current_user_key_ = false;
+ pinned_iters_mgr_.ReleasePinnedData();
+
+ if (merge_helper_->FilteredUntil(&skip_until)) {
+ need_skip = true;
+ }
+ }
+ } else {
+ // 1. new user key -OR-
+ // 2. different snapshot stripe
+ bool should_delete = range_del_agg_->ShouldDelete(
+ key_, RangeDelPositioningMode::kForwardTraversal);
+ if (should_delete) {
+ ++iter_stats_.num_record_drop_hidden;
+ ++iter_stats_.num_record_drop_range_del;
+ input_->Next();
+ } else {
+ valid_ = true;
+ }
+ }
+
+ if (need_skip) {
+ input_->Seek(skip_until);
+ }
+ }
+
+ if (!valid_ && IsShuttingDown()) {
+ status_ = Status::ShutdownInProgress();
+ }
+
+ if (IsPausingManualCompaction()) {
+ status_ = Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+ }
+}
+
+void CompactionIterator::PrepareOutput() {
+ if (valid_) {
+ if (compaction_filter_ && ikey_.type == kTypeBlobIndex) {
+ const auto blob_decision = compaction_filter_->PrepareBlobOutput(
+ user_key(), value_, &compaction_filter_value_);
+
+ if (blob_decision == CompactionFilter::BlobDecision::kCorruption) {
+ status_ = Status::Corruption(
+ "Corrupted blob reference encountered during GC");
+ valid_ = false;
+ } else if (blob_decision == CompactionFilter::BlobDecision::kIOError) {
+ status_ = Status::IOError("Could not relocate blob during GC");
+ valid_ = false;
+ } else if (blob_decision ==
+ CompactionFilter::BlobDecision::kChangeValue) {
+ value_ = compaction_filter_value_;
+ }
+ }
+
+ // Zeroing out the sequence number leads to better compression.
+ // If this is the bottommost level (no files in lower levels)
+ // and the earliest snapshot is larger than this seqno
+ // and the userkey differs from the last userkey in compaction
+ // then we can squash the seqno to zero.
+ //
+    // This is safe for TransactionDB write-conflict checking since
+    // transactions only care about sequence numbers larger than any active
+    // snapshot.
+ //
+    // Can we do the same for levels above the bottom level as long as
+    // KeyNotExistsBeyondOutputLevel() returns true?
+ if (valid_ && compaction_ != nullptr &&
+ !compaction_->allow_ingest_behind() &&
+ ikeyNotNeededForIncrementalSnapshot() && bottommost_level_ &&
+ IN_EARLIEST_SNAPSHOT(ikey_.sequence) && ikey_.type != kTypeMerge) {
+ assert(ikey_.type != kTypeDeletion && ikey_.type != kTypeSingleDeletion);
+ if (ikey_.type == kTypeDeletion || ikey_.type == kTypeSingleDeletion) {
+ ROCKS_LOG_FATAL(info_log_,
+ "Unexpected key type %d for seq-zero optimization",
+ ikey_.type);
+ }
+ ikey_.sequence = 0;
+ current_key_.UpdateInternalKey(0, ikey_.type);
+ }
+ }
+}
+
+inline SequenceNumber CompactionIterator::findEarliestVisibleSnapshot(
+ SequenceNumber in, SequenceNumber* prev_snapshot) {
+ assert(snapshots_->size());
+ if (snapshots_->size() == 0) {
+ ROCKS_LOG_FATAL(info_log_,
+ "No snapshot left in findEarliestVisibleSnapshot");
+ }
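+  // snapshots_ is sorted in ascending order; find the first snapshot whose
+  // sequence number is >= `in`, i.e. the earliest snapshot that could see
+  // this key.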
+ auto snapshots_iter = std::lower_bound(
+ snapshots_->begin(), snapshots_->end(), in);
+ if (snapshots_iter == snapshots_->begin()) {
+ *prev_snapshot = 0;
+ } else {
+ *prev_snapshot = *std::prev(snapshots_iter);
+ assert(*prev_snapshot < in);
+ if (*prev_snapshot >= in) {
+ ROCKS_LOG_FATAL(info_log_,
+ "*prev_snapshot >= in in findEarliestVisibleSnapshot");
+ }
+ }
+ if (snapshot_checker_ == nullptr) {
+ return snapshots_iter != snapshots_->end()
+ ? *snapshots_iter : kMaxSequenceNumber;
+ }
+ bool has_released_snapshot = !released_snapshots_.empty();
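+  // With a snapshot_checker_, a snapshot whose number is >= `in` may still
+  // not see this write (it may be uncommitted with respect to that snapshot),
+  // so walk forward until the checker reports kInSnapshot, skipping snapshots
+  // that have already been released.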
+ for (; snapshots_iter != snapshots_->end(); ++snapshots_iter) {
+ auto cur = *snapshots_iter;
+ assert(in <= cur);
+ if (in > cur) {
+ ROCKS_LOG_FATAL(info_log_, "in > cur in findEarliestVisibleSnapshot");
+ }
+ // Skip if cur is in released_snapshots.
+ if (has_released_snapshot && released_snapshots_.count(cur) > 0) {
+ continue;
+ }
+ auto res = snapshot_checker_->CheckInSnapshot(in, cur);
+ if (res == SnapshotCheckerResult::kInSnapshot) {
+ return cur;
+ } else if (res == SnapshotCheckerResult::kSnapshotReleased) {
+ released_snapshots_.insert(cur);
+ }
+ *prev_snapshot = cur;
+ }
+ return kMaxSequenceNumber;
+}
+
+// Used in two places: it prevents deletion markers from being dropped if they
+// may still be needed, and disables seqnum zero-out in PrepareOutput() for
+// recent keys.
+inline bool CompactionIterator::ikeyNotNeededForIncrementalSnapshot() {
+ return (!compaction_->preserve_deletes()) ||
+ (ikey_.sequence < preserve_deletes_seqnum_);
+}
+
+bool CompactionIterator::IsInEarliestSnapshot(SequenceNumber sequence) {
+ assert(snapshot_checker_ != nullptr);
+ bool pre_condition = (earliest_snapshot_ == kMaxSequenceNumber ||
+ (earliest_snapshot_iter_ != snapshots_->end() &&
+ *earliest_snapshot_iter_ == earliest_snapshot_));
+ assert(pre_condition);
+ if (!pre_condition) {
+ ROCKS_LOG_FATAL(info_log_,
+                    "Pre-condition does not hold in IsInEarliestSnapshot");
+ }
+ auto in_snapshot =
+ snapshot_checker_->CheckInSnapshot(sequence, earliest_snapshot_);
+ while (UNLIKELY(in_snapshot == SnapshotCheckerResult::kSnapshotReleased)) {
+    // Avoid the current earliest_snapshot_ being returned as the earliest
+    // visible snapshot for the next value. So if a value's sequence is
+    // zeroed out by PrepareOutput(), the next value will be compacted out.
+ released_snapshots_.insert(earliest_snapshot_);
+ earliest_snapshot_iter_++;
+
+ if (earliest_snapshot_iter_ == snapshots_->end()) {
+ earliest_snapshot_ = kMaxSequenceNumber;
+ } else {
+ earliest_snapshot_ = *earliest_snapshot_iter_;
+ }
+ in_snapshot =
+ snapshot_checker_->CheckInSnapshot(sequence, earliest_snapshot_);
+ }
+ assert(in_snapshot != SnapshotCheckerResult::kSnapshotReleased);
+ if (in_snapshot == SnapshotCheckerResult::kSnapshotReleased) {
+ ROCKS_LOG_FATAL(info_log_,
+ "Unexpected released snapshot in IsInEarliestSnapshot");
+ }
+ return in_snapshot == SnapshotCheckerResult::kInSnapshot;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_iterator.h b/src/rocksdb/db/compaction/compaction_iterator.h
new file mode 100644
index 000000000..8be60eb9e
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_iterator.h
@@ -0,0 +1,240 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <algorithm>
+#include <deque>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "db/compaction/compaction.h"
+#include "db/compaction/compaction_iteration_stats.h"
+#include "db/merge_helper.h"
+#include "db/pinned_iterators_manager.h"
+#include "db/range_del_aggregator.h"
+#include "db/snapshot_checker.h"
+#include "options/cf_options.h"
+#include "rocksdb/compaction_filter.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class CompactionIterator {
+ public:
+ // A wrapper around Compaction. Has a much smaller interface, only what
+ // CompactionIterator uses. Tests can override it.
+ class CompactionProxy {
+ public:
+ explicit CompactionProxy(const Compaction* compaction)
+ : compaction_(compaction) {}
+
+ virtual ~CompactionProxy() = default;
+ virtual int level(size_t /*compaction_input_level*/ = 0) const {
+ return compaction_->level();
+ }
+ virtual bool KeyNotExistsBeyondOutputLevel(
+ const Slice& user_key, std::vector<size_t>* level_ptrs) const {
+ return compaction_->KeyNotExistsBeyondOutputLevel(user_key, level_ptrs);
+ }
+ virtual bool bottommost_level() const {
+ return compaction_->bottommost_level();
+ }
+ virtual int number_levels() const { return compaction_->number_levels(); }
+ virtual Slice GetLargestUserKey() const {
+ return compaction_->GetLargestUserKey();
+ }
+ virtual bool allow_ingest_behind() const {
+ return compaction_->immutable_cf_options()->allow_ingest_behind;
+ }
+ virtual bool preserve_deletes() const {
+ return compaction_->immutable_cf_options()->preserve_deletes;
+ }
+
+ protected:
+ CompactionProxy() = default;
+
+ private:
+ const Compaction* compaction_;
+ };
+
+ CompactionIterator(
+ InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
+ SequenceNumber last_sequence, std::vector<SequenceNumber>* snapshots,
+ SequenceNumber earliest_write_conflict_snapshot,
+ const SnapshotChecker* snapshot_checker, Env* env,
+ bool report_detailed_time, bool expect_valid_internal_key,
+ CompactionRangeDelAggregator* range_del_agg,
+ const Compaction* compaction = nullptr,
+ const CompactionFilter* compaction_filter = nullptr,
+ const std::atomic<bool>* shutting_down = nullptr,
+ const SequenceNumber preserve_deletes_seqnum = 0,
+ const std::atomic<bool>* manual_compaction_paused = nullptr,
+ const std::shared_ptr<Logger> info_log = nullptr);
+
+ // Constructor with custom CompactionProxy, used for tests.
+ CompactionIterator(
+ InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
+ SequenceNumber last_sequence, std::vector<SequenceNumber>* snapshots,
+ SequenceNumber earliest_write_conflict_snapshot,
+ const SnapshotChecker* snapshot_checker, Env* env,
+ bool report_detailed_time, bool expect_valid_internal_key,
+ CompactionRangeDelAggregator* range_del_agg,
+ std::unique_ptr<CompactionProxy> compaction,
+ const CompactionFilter* compaction_filter = nullptr,
+ const std::atomic<bool>* shutting_down = nullptr,
+ const SequenceNumber preserve_deletes_seqnum = 0,
+ const std::atomic<bool>* manual_compaction_paused = nullptr,
+ const std::shared_ptr<Logger> info_log = nullptr);
+
+ ~CompactionIterator();
+
+ void ResetRecordCounts();
+
+ // Seek to the beginning of the compaction iterator output.
+ //
+ // REQUIRED: Call only once.
+ void SeekToFirst();
+
+ // Produces the next record in the compaction.
+ //
+ // REQUIRED: SeekToFirst() has been called.
+ void Next();
+
+ // Getters
+ const Slice& key() const { return key_; }
+ const Slice& value() const { return value_; }
+ const Status& status() const { return status_; }
+ const ParsedInternalKey& ikey() const { return ikey_; }
+ bool Valid() const { return valid_; }
+ const Slice& user_key() const { return current_user_key_; }
+ const CompactionIterationStats& iter_stats() const { return iter_stats_; }
+
+ private:
+ // Processes the input stream to find the next output
+ void NextFromInput();
+
+  // Do last preparations before presenting the output to the caller. At this
+ // point this only zeroes out the sequence number if possible for better
+ // compression.
+ void PrepareOutput();
+
+ // Invoke compaction filter if needed.
+ void InvokeFilterIfNeeded(bool* need_skip, Slice* skip_until);
+
+ // Given a sequence number, return the sequence number of the
+ // earliest snapshot that this sequence number is visible in.
+ // The snapshots themselves are arranged in ascending order of
+ // sequence numbers.
+  // A binary search locates the first candidate snapshot; with a
+  // snapshot_checker the remaining snapshots are scanned sequentially, which
+  // is cheap because the total number of snapshots is typically small.
+ inline SequenceNumber findEarliestVisibleSnapshot(
+ SequenceNumber in, SequenceNumber* prev_snapshot);
+
+  // Checks whether the currently seen ikey_ is needed for an
+  // incremental (differential) snapshot and hence can't be dropped or have
+  // its seqnum zeroed out, even if all other conditions for that are met.
+ inline bool ikeyNotNeededForIncrementalSnapshot();
+
+ inline bool KeyCommitted(SequenceNumber sequence) {
+ return snapshot_checker_ == nullptr ||
+ snapshot_checker_->CheckInSnapshot(sequence, kMaxSequenceNumber) ==
+ SnapshotCheckerResult::kInSnapshot;
+ }
+
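+  // Returns true if `sequence` is visible in the earliest snapshot, consulting
+  // snapshot_checker_ and skipping over snapshots that have been released.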
+ bool IsInEarliestSnapshot(SequenceNumber sequence);
+
+ InternalIterator* input_;
+ const Comparator* cmp_;
+ MergeHelper* merge_helper_;
+ const std::vector<SequenceNumber>* snapshots_;
+ // List of snapshots released during compaction.
+  // findEarliestVisibleSnapshot() learns about them from the return value of
+  // snapshot_checker, and makes sure they will not be returned as the
+  // earliest visible snapshot of an older value.
+ // See WritePreparedTransactionTest::ReleaseSnapshotDuringCompaction3.
+ std::unordered_set<SequenceNumber> released_snapshots_;
+ std::vector<SequenceNumber>::const_iterator earliest_snapshot_iter_;
+ const SequenceNumber earliest_write_conflict_snapshot_;
+ const SnapshotChecker* const snapshot_checker_;
+ Env* env_;
+ bool report_detailed_time_;
+ bool expect_valid_internal_key_;
+ CompactionRangeDelAggregator* range_del_agg_;
+ std::unique_ptr<CompactionProxy> compaction_;
+ const CompactionFilter* compaction_filter_;
+ const std::atomic<bool>* shutting_down_;
+ const std::atomic<bool>* manual_compaction_paused_;
+ const SequenceNumber preserve_deletes_seqnum_;
+ bool bottommost_level_;
+ bool valid_ = false;
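+  // True when there are no snapshots, so every key is visible at the "tip"
+  // of the DB and per-snapshot handling can be skipped.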
+ bool visible_at_tip_;
+ SequenceNumber earliest_snapshot_;
+ SequenceNumber latest_snapshot_;
+
+ // State
+ //
+ // Points to a copy of the current compaction iterator output (current_key_)
+ // if valid_.
+ Slice key_;
+ // Points to the value in the underlying iterator that corresponds to the
+ // current output.
+ Slice value_;
+ // The status is OK unless compaction iterator encounters a merge operand
+ // while not having a merge operator defined.
+ Status status_;
+ // Stores the user key, sequence number and type of the current compaction
+ // iterator output (or current key in the underlying iterator during
+ // NextFromInput()).
+ ParsedInternalKey ikey_;
+ // Stores whether ikey_.user_key is valid. If set to false, the user key is
+ // not compared against the current key in the underlying iterator.
+ bool has_current_user_key_ = false;
+  // If true, the underlying input iterator is already positioned at the entry
+  // following the current output, so Next() must not advance it again.
+  bool at_next_ = false;
+ // Holds a copy of the current compaction iterator output (or current key in
+ // the underlying iterator during NextFromInput()).
+ IterKey current_key_;
+ Slice current_user_key_;
+ SequenceNumber current_user_key_sequence_;
+ SequenceNumber current_user_key_snapshot_;
+
+ // True if the iterator has already returned a record for the current key.
+ bool has_outputted_key_ = false;
+
+  // If true, clear the value of the next key and output it without applying
+  // any compaction rules. This is used for outputting a put after a single
+  // delete.
+ bool clear_and_output_next_key_ = false;
+
+ MergeOutputIterator merge_out_iter_;
+ // PinnedIteratorsManager used to pin input_ Iterator blocks while reading
+ // merge operands and then releasing them after consuming them.
+ PinnedIteratorsManager pinned_iters_mgr_;
+ std::string compaction_filter_value_;
+ InternalKey compaction_filter_skip_until_;
+  // "level_ptrs_" holds, for each level, the index of the file we were last
+  // checking during the previous call to
+  // compaction_->KeyNotExistsBeyondOutputLevel(). This allows future calls to
+  // pick up where the previous one left off: since each subcompaction's key
+  // range is increasing, a later call must be looking for a key that is in or
+  // beyond the last file checked during the previous call.
+ std::vector<size_t> level_ptrs_;
+ CompactionIterationStats iter_stats_;
+
+ // Used to avoid purging uncommitted values. The application can specify
+ // uncommitted values by providing a SnapshotChecker object.
+ bool current_key_committed_;
+ std::shared_ptr<Logger> info_log_;
+
+ bool IsShuttingDown() {
+ // This is a best-effort facility, so memory_order_relaxed is sufficient.
+ return shutting_down_ && shutting_down_->load(std::memory_order_relaxed);
+ }
+
+ bool IsPausingManualCompaction() {
+ // This is a best-effort facility, so memory_order_relaxed is sufficient.
+ return manual_compaction_paused_ &&
+ manual_compaction_paused_->load(std::memory_order_relaxed);
+ }
+};
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_iterator_test.cc b/src/rocksdb/db/compaction/compaction_iterator_test.cc
new file mode 100644
index 000000000..0c50fb9ba
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_iterator_test.cc
@@ -0,0 +1,976 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+
+#include <string>
+#include <vector>
+
+#include "db/compaction/compaction_iterator.h"
+#include "port/port.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Expects no merging attempts.
+class NoMergingMergeOp : public MergeOperator {
+ public:
+ bool FullMergeV2(const MergeOperationInput& /*merge_in*/,
+ MergeOperationOutput* /*merge_out*/) const override {
+ ADD_FAILURE();
+ return false;
+ }
+ bool PartialMergeMulti(const Slice& /*key*/,
+ const std::deque<Slice>& /*operand_list*/,
+ std::string* /*new_value*/,
+ Logger* /*logger*/) const override {
+ ADD_FAILURE();
+ return false;
+ }
+ const char* Name() const override {
+ return "CompactionIteratorTest NoMergingMergeOp";
+ }
+};
+
+// Compaction filter that gets stuck when it sees a particular key,
+// then gets unstuck when told to.
+// Always returns Decision::kRemove.
+class StallingFilter : public CompactionFilter {
+ public:
+ Decision FilterV2(int /*level*/, const Slice& key, ValueType /*type*/,
+ const Slice& /*existing_value*/, std::string* /*new_value*/,
+ std::string* /*skip_until*/) const override {
+ int k = std::atoi(key.ToString().c_str());
+ last_seen.store(k);
+ while (k >= stall_at.load()) {
+ std::this_thread::yield();
+ }
+ return Decision::kRemove;
+ }
+
+ const char* Name() const override {
+ return "CompactionIteratorTest StallingFilter";
+ }
+
+ // Wait until the filter sees a key >= k and stalls at that key.
+ // If `exact`, asserts that the seen key is equal to k.
+ void WaitForStall(int k, bool exact = true) {
+ stall_at.store(k);
+ while (last_seen.load() < k) {
+ std::this_thread::yield();
+ }
+ if (exact) {
+ EXPECT_EQ(k, last_seen.load());
+ }
+ }
+
+ // Filter will stall on key >= stall_at. Advance stall_at to unstall.
+ mutable std::atomic<int> stall_at{0};
+ // Last key the filter was called with.
+ mutable std::atomic<int> last_seen{0};
+};
+
+// Compaction filter that filters out all keys.
+class FilterAllKeysCompactionFilter : public CompactionFilter {
+ public:
+ Decision FilterV2(int /*level*/, const Slice& /*key*/, ValueType /*type*/,
+ const Slice& /*existing_value*/, std::string* /*new_value*/,
+ std::string* /*skip_until*/) const override {
+ return Decision::kRemove;
+ }
+
+ const char* Name() const override { return "AllKeysCompactionFilter"; }
+};
+
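+// A forward-only InternalIterator over in-memory key/value vectors that
+// records every SeekToFirst/Seek/Next call so tests can verify the compaction
+// iterator's access pattern on the underlying input.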
+class LoggingForwardVectorIterator : public InternalIterator {
+ public:
+ struct Action {
+ enum class Type {
+ SEEK_TO_FIRST,
+ SEEK,
+ NEXT,
+ };
+
+ Type type;
+ std::string arg;
+
+ explicit Action(Type _type, std::string _arg = "")
+ : type(_type), arg(_arg) {}
+
+ bool operator==(const Action& rhs) const {
+ return std::tie(type, arg) == std::tie(rhs.type, rhs.arg);
+ }
+ };
+
+ LoggingForwardVectorIterator(const std::vector<std::string>& keys,
+ const std::vector<std::string>& values)
+ : keys_(keys), values_(values), current_(keys.size()) {
+ assert(keys_.size() == values_.size());
+ }
+
+ bool Valid() const override { return current_ < keys_.size(); }
+
+ void SeekToFirst() override {
+ log.emplace_back(Action::Type::SEEK_TO_FIRST);
+ current_ = 0;
+ }
+ void SeekToLast() override { assert(false); }
+
+ void Seek(const Slice& target) override {
+ log.emplace_back(Action::Type::SEEK, target.ToString());
+ current_ = std::lower_bound(keys_.begin(), keys_.end(), target.ToString()) -
+ keys_.begin();
+ }
+
+ void SeekForPrev(const Slice& /*target*/) override { assert(false); }
+
+ void Next() override {
+ assert(Valid());
+ log.emplace_back(Action::Type::NEXT);
+ current_++;
+ }
+ void Prev() override { assert(false); }
+
+ Slice key() const override {
+ assert(Valid());
+ return Slice(keys_[current_]);
+ }
+ Slice value() const override {
+ assert(Valid());
+ return Slice(values_[current_]);
+ }
+
+ Status status() const override { return Status::OK(); }
+
+ std::vector<Action> log;
+
+ private:
+ std::vector<std::string> keys_;
+ std::vector<std::string> values_;
+ size_t current_;
+};
+
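+// A CompactionProxy stub that lets tests control whether the compaction is at
+// the bottommost level and whether keys may exist beyond the output level.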
+class FakeCompaction : public CompactionIterator::CompactionProxy {
+ public:
+ FakeCompaction() = default;
+
+ int level(size_t /*compaction_input_level*/) const override { return 0; }
+ bool KeyNotExistsBeyondOutputLevel(
+ const Slice& /*user_key*/,
+ std::vector<size_t>* /*level_ptrs*/) const override {
+ return is_bottommost_level || key_not_exists_beyond_output_level;
+ }
+ bool bottommost_level() const override { return is_bottommost_level; }
+ int number_levels() const override { return 1; }
+ Slice GetLargestUserKey() const override {
+ return "\xff\xff\xff\xff\xff\xff\xff\xff\xff";
+ }
+ bool allow_ingest_behind() const override { return false; }
+
+ bool preserve_deletes() const override { return false; }
+
+ bool key_not_exists_beyond_output_level = false;
+
+ bool is_bottommost_level = false;
+};
+
+// A simplified snapshot checker which assumes each snapshot has a global
+// last visible sequence.
+class TestSnapshotChecker : public SnapshotChecker {
+ public:
+ explicit TestSnapshotChecker(
+ SequenceNumber last_committed_sequence,
+ const std::unordered_map<SequenceNumber, SequenceNumber>& snapshots = {{}})
+ : last_committed_sequence_(last_committed_sequence),
+ snapshots_(snapshots) {}
+
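+  // A snapshot_seq of kMaxSequenceNumber queries against the latest state,
+  // i.e. whether `seq` has been committed at all.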
+ SnapshotCheckerResult CheckInSnapshot(
+ SequenceNumber seq, SequenceNumber snapshot_seq) const override {
+ if (snapshot_seq == kMaxSequenceNumber) {
+ return seq <= last_committed_sequence_
+ ? SnapshotCheckerResult::kInSnapshot
+ : SnapshotCheckerResult::kNotInSnapshot;
+ }
+ assert(snapshots_.count(snapshot_seq) > 0);
+ return seq <= snapshots_.at(snapshot_seq)
+ ? SnapshotCheckerResult::kInSnapshot
+ : SnapshotCheckerResult::kNotInSnapshot;
+ }
+
+ private:
+ SequenceNumber last_committed_sequence_;
+  // Maps each valid snapshot to the last sequence visible to that snapshot.
+ std::unordered_map<SequenceNumber, SequenceNumber> snapshots_;
+};
+
+// Test param:
+// bool: whether to pass snapshot_checker to compaction iterator.
+class CompactionIteratorTest : public testing::TestWithParam<bool> {
+ public:
+ CompactionIteratorTest()
+ : cmp_(BytewiseComparator()), icmp_(cmp_), snapshots_({}) {}
+
+ void InitIterators(
+ const std::vector<std::string>& ks, const std::vector<std::string>& vs,
+ const std::vector<std::string>& range_del_ks,
+ const std::vector<std::string>& range_del_vs,
+ SequenceNumber last_sequence,
+ SequenceNumber last_committed_sequence = kMaxSequenceNumber,
+ MergeOperator* merge_op = nullptr, CompactionFilter* filter = nullptr,
+ bool bottommost_level = false,
+ SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber) {
+ std::unique_ptr<InternalIterator> unfragmented_range_del_iter(
+ new test::VectorIterator(range_del_ks, range_del_vs));
+ auto tombstone_list = std::make_shared<FragmentedRangeTombstoneList>(
+ std::move(unfragmented_range_del_iter), icmp_);
+ std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+ new FragmentedRangeTombstoneIterator(tombstone_list, icmp_,
+ kMaxSequenceNumber));
+ range_del_agg_.reset(new CompactionRangeDelAggregator(&icmp_, snapshots_));
+ range_del_agg_->AddTombstones(std::move(range_del_iter));
+
+ std::unique_ptr<CompactionIterator::CompactionProxy> compaction;
+ if (filter || bottommost_level) {
+ compaction_proxy_ = new FakeCompaction();
+ compaction_proxy_->is_bottommost_level = bottommost_level;
+ compaction.reset(compaction_proxy_);
+ }
+ bool use_snapshot_checker = UseSnapshotChecker() || GetParam();
+ if (use_snapshot_checker || last_committed_sequence < kMaxSequenceNumber) {
+ snapshot_checker_.reset(
+ new TestSnapshotChecker(last_committed_sequence, snapshot_map_));
+ }
+ merge_helper_.reset(
+ new MergeHelper(Env::Default(), cmp_, merge_op, filter, nullptr, false,
+ 0 /*latest_snapshot*/, snapshot_checker_.get(),
+ 0 /*level*/, nullptr /*statistics*/, &shutting_down_));
+
+ iter_.reset(new LoggingForwardVectorIterator(ks, vs));
+ iter_->SeekToFirst();
+ c_iter_.reset(new CompactionIterator(
+ iter_.get(), cmp_, merge_helper_.get(), last_sequence, &snapshots_,
+ earliest_write_conflict_snapshot, snapshot_checker_.get(),
+ Env::Default(), false /* report_detailed_time */, false,
+ range_del_agg_.get(), std::move(compaction), filter, &shutting_down_));
+ }
+
+ void AddSnapshot(SequenceNumber snapshot,
+ SequenceNumber last_visible_seq = kMaxSequenceNumber) {
+ snapshots_.push_back(snapshot);
+ snapshot_map_[snapshot] = last_visible_seq;
+ }
+
+ virtual bool UseSnapshotChecker() const { return false; }
+
+ void RunTest(
+ const std::vector<std::string>& input_keys,
+ const std::vector<std::string>& input_values,
+ const std::vector<std::string>& expected_keys,
+ const std::vector<std::string>& expected_values,
+ SequenceNumber last_committed_seq = kMaxSequenceNumber,
+ MergeOperator* merge_operator = nullptr,
+ CompactionFilter* compaction_filter = nullptr,
+ bool bottommost_level = false,
+ SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber) {
+ InitIterators(input_keys, input_values, {}, {}, kMaxSequenceNumber,
+ last_committed_seq, merge_operator, compaction_filter,
+ bottommost_level, earliest_write_conflict_snapshot);
+ c_iter_->SeekToFirst();
+ for (size_t i = 0; i < expected_keys.size(); i++) {
+ std::string info = "i = " + ToString(i);
+ ASSERT_TRUE(c_iter_->Valid()) << info;
+ ASSERT_OK(c_iter_->status()) << info;
+ ASSERT_EQ(expected_keys[i], c_iter_->key().ToString()) << info;
+ ASSERT_EQ(expected_values[i], c_iter_->value().ToString()) << info;
+ c_iter_->Next();
+ }
+ ASSERT_FALSE(c_iter_->Valid());
+ }
+
+ const Comparator* cmp_;
+ const InternalKeyComparator icmp_;
+ std::vector<SequenceNumber> snapshots_;
+  // Maps each valid snapshot to the last sequence visible to that snapshot.
+ std::unordered_map<SequenceNumber, SequenceNumber> snapshot_map_;
+ std::unique_ptr<MergeHelper> merge_helper_;
+ std::unique_ptr<LoggingForwardVectorIterator> iter_;
+ std::unique_ptr<CompactionIterator> c_iter_;
+ std::unique_ptr<CompactionRangeDelAggregator> range_del_agg_;
+ std::unique_ptr<SnapshotChecker> snapshot_checker_;
+ std::atomic<bool> shutting_down_{false};
+ FakeCompaction* compaction_proxy_;
+};
+
+// It is possible that the output of the compaction iterator is empty even if
+// the input is not.
+TEST_P(CompactionIteratorTest, EmptyResult) {
+ InitIterators({test::KeyStr("a", 5, kTypeSingleDeletion),
+ test::KeyStr("a", 3, kTypeValue)},
+ {"", "val"}, {}, {}, 5);
+ c_iter_->SeekToFirst();
+ ASSERT_FALSE(c_iter_->Valid());
+}
+
+// If there is a corruption after a single deletion, the corrupted key should
+// be preserved.
+TEST_P(CompactionIteratorTest, CorruptionAfterSingleDeletion) {
+ InitIterators({test::KeyStr("a", 5, kTypeSingleDeletion),
+ test::KeyStr("a", 3, kTypeValue, true),
+ test::KeyStr("b", 10, kTypeValue)},
+ {"", "val", "val2"}, {}, {}, 10);
+ c_iter_->SeekToFirst();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("a", 5, kTypeSingleDeletion),
+ c_iter_->key().ToString());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("a", 3, kTypeValue, true), c_iter_->key().ToString());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("b", 10, kTypeValue), c_iter_->key().ToString());
+ c_iter_->Next();
+ ASSERT_FALSE(c_iter_->Valid());
+}
+
+TEST_P(CompactionIteratorTest, SimpleRangeDeletion) {
+ InitIterators({test::KeyStr("morning", 5, kTypeValue),
+ test::KeyStr("morning", 2, kTypeValue),
+ test::KeyStr("night", 3, kTypeValue)},
+ {"zao", "zao", "wan"},
+ {test::KeyStr("ma", 4, kTypeRangeDeletion)}, {"mz"}, 5);
+ c_iter_->SeekToFirst();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("morning", 5, kTypeValue), c_iter_->key().ToString());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("night", 3, kTypeValue), c_iter_->key().ToString());
+ c_iter_->Next();
+ ASSERT_FALSE(c_iter_->Valid());
+}
+
+TEST_P(CompactionIteratorTest, RangeDeletionWithSnapshots) {
+ AddSnapshot(10);
+ std::vector<std::string> ks1;
+ ks1.push_back(test::KeyStr("ma", 28, kTypeRangeDeletion));
+ std::vector<std::string> vs1{"mz"};
+ std::vector<std::string> ks2{test::KeyStr("morning", 15, kTypeValue),
+ test::KeyStr("morning", 5, kTypeValue),
+ test::KeyStr("night", 40, kTypeValue),
+ test::KeyStr("night", 20, kTypeValue)};
+ std::vector<std::string> vs2{"zao 15", "zao 5", "wan 40", "wan 20"};
+ InitIterators(ks2, vs2, ks1, vs1, 40);
+ c_iter_->SeekToFirst();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("morning", 5, kTypeValue), c_iter_->key().ToString());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("night", 40, kTypeValue), c_iter_->key().ToString());
+ c_iter_->Next();
+ ASSERT_FALSE(c_iter_->Valid());
+}
+
+TEST_P(CompactionIteratorTest, CompactionFilterSkipUntil) {
+ class Filter : public CompactionFilter {
+ Decision FilterV2(int /*level*/, const Slice& key, ValueType t,
+ const Slice& existing_value, std::string* /*new_value*/,
+ std::string* skip_until) const override {
+ std::string k = key.ToString();
+ std::string v = existing_value.ToString();
+ // See InitIterators() call below for the sequence of keys and their
+      // filtering decisions. Here we assert that the compaction filter is
+      // called with exactly the expected keys, and with the right values.
+ if (k == "a") {
+ EXPECT_EQ(ValueType::kValue, t);
+ EXPECT_EQ("av50", v);
+ return Decision::kKeep;
+ }
+ if (k == "b") {
+ EXPECT_EQ(ValueType::kValue, t);
+ EXPECT_EQ("bv60", v);
+ *skip_until = "d+";
+ return Decision::kRemoveAndSkipUntil;
+ }
+ if (k == "e") {
+ EXPECT_EQ(ValueType::kMergeOperand, t);
+ EXPECT_EQ("em71", v);
+ return Decision::kKeep;
+ }
+ if (k == "f") {
+ if (v == "fm65") {
+ EXPECT_EQ(ValueType::kMergeOperand, t);
+ *skip_until = "f";
+ } else {
+ EXPECT_EQ("fm30", v);
+ EXPECT_EQ(ValueType::kMergeOperand, t);
+ *skip_until = "g+";
+ }
+ return Decision::kRemoveAndSkipUntil;
+ }
+ if (k == "h") {
+ EXPECT_EQ(ValueType::kValue, t);
+ EXPECT_EQ("hv91", v);
+ return Decision::kKeep;
+ }
+ if (k == "i") {
+ EXPECT_EQ(ValueType::kMergeOperand, t);
+ EXPECT_EQ("im95", v);
+ *skip_until = "z";
+ return Decision::kRemoveAndSkipUntil;
+ }
+ ADD_FAILURE();
+ return Decision::kKeep;
+ }
+
+ const char* Name() const override {
+ return "CompactionIteratorTest.CompactionFilterSkipUntil::Filter";
+ }
+ };
+
+ NoMergingMergeOp merge_op;
+ Filter filter;
+ InitIterators(
+ {test::KeyStr("a", 50, kTypeValue), // keep
+ test::KeyStr("a", 45, kTypeMerge),
+ test::KeyStr("b", 60, kTypeValue), // skip to "d+"
+ test::KeyStr("b", 40, kTypeValue), test::KeyStr("c", 35, kTypeValue),
+ test::KeyStr("d", 70, kTypeMerge),
+ test::KeyStr("e", 71, kTypeMerge), // keep
+ test::KeyStr("f", 65, kTypeMerge), // skip to "f", aka keep
+ test::KeyStr("f", 30, kTypeMerge), // skip to "g+"
+ test::KeyStr("f", 25, kTypeValue), test::KeyStr("g", 90, kTypeValue),
+ test::KeyStr("h", 91, kTypeValue), // keep
+ test::KeyStr("i", 95, kTypeMerge), // skip to "z"
+ test::KeyStr("j", 99, kTypeValue)},
+ {"av50", "am45", "bv60", "bv40", "cv35", "dm70", "em71", "fm65", "fm30",
+ "fv25", "gv90", "hv91", "im95", "jv99"},
+ {}, {}, kMaxSequenceNumber, kMaxSequenceNumber, &merge_op, &filter);
+
+ // Compaction should output just "a", "e" and "h" keys.
+ c_iter_->SeekToFirst();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("a", 50, kTypeValue), c_iter_->key().ToString());
+ ASSERT_EQ("av50", c_iter_->value().ToString());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("e", 71, kTypeMerge), c_iter_->key().ToString());
+ ASSERT_EQ("em71", c_iter_->value().ToString());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("h", 91, kTypeValue), c_iter_->key().ToString());
+ ASSERT_EQ("hv91", c_iter_->value().ToString());
+ c_iter_->Next();
+ ASSERT_FALSE(c_iter_->Valid());
+
+ // Check that the compaction iterator did the correct sequence of calls on
+ // the underlying iterator.
+ using A = LoggingForwardVectorIterator::Action;
+ using T = A::Type;
+ std::vector<A> expected_actions = {
+ A(T::SEEK_TO_FIRST),
+ A(T::NEXT),
+ A(T::NEXT),
+ A(T::SEEK, test::KeyStr("d+", kMaxSequenceNumber, kValueTypeForSeek)),
+ A(T::NEXT),
+ A(T::NEXT),
+ A(T::SEEK, test::KeyStr("g+", kMaxSequenceNumber, kValueTypeForSeek)),
+ A(T::NEXT),
+ A(T::SEEK, test::KeyStr("z", kMaxSequenceNumber, kValueTypeForSeek))};
+ ASSERT_EQ(expected_actions, iter_->log);
+}
+
+TEST_P(CompactionIteratorTest, ShuttingDownInFilter) {
+ NoMergingMergeOp merge_op;
+ StallingFilter filter;
+ InitIterators(
+ {test::KeyStr("1", 1, kTypeValue), test::KeyStr("2", 2, kTypeValue),
+ test::KeyStr("3", 3, kTypeValue), test::KeyStr("4", 4, kTypeValue)},
+ {"v1", "v2", "v3", "v4"}, {}, {}, kMaxSequenceNumber, kMaxSequenceNumber,
+ &merge_op, &filter);
+ // Don't leave tombstones (kTypeDeletion) for filtered keys.
+ compaction_proxy_->key_not_exists_beyond_output_level = true;
+
+ std::atomic<bool> seek_done{false};
+ ROCKSDB_NAMESPACE::port::Thread compaction_thread([&] {
+ c_iter_->SeekToFirst();
+ EXPECT_FALSE(c_iter_->Valid());
+ EXPECT_TRUE(c_iter_->status().IsShutdownInProgress());
+ seek_done.store(true);
+ });
+
+ // Let key 1 through.
+ filter.WaitForStall(1);
+
+ // Shutdown during compaction filter call for key 2.
+ filter.WaitForStall(2);
+ shutting_down_.store(true);
+ EXPECT_FALSE(seek_done.load());
+
+ // Unstall filter and wait for SeekToFirst() to return.
+ filter.stall_at.store(3);
+ compaction_thread.join();
+ assert(seek_done.load());
+
+ // Check that filter was never called again.
+ EXPECT_EQ(2, filter.last_seen.load());
+}
+
+// Same as ShuttingDownInFilter, but shutdown happens during filter call for
+// a merge operand, not for a value.
+TEST_P(CompactionIteratorTest, ShuttingDownInMerge) {
+ NoMergingMergeOp merge_op;
+ StallingFilter filter;
+ InitIterators(
+ {test::KeyStr("1", 1, kTypeValue), test::KeyStr("2", 2, kTypeMerge),
+ test::KeyStr("3", 3, kTypeMerge), test::KeyStr("4", 4, kTypeValue)},
+ {"v1", "v2", "v3", "v4"}, {}, {}, kMaxSequenceNumber, kMaxSequenceNumber,
+ &merge_op, &filter);
+ compaction_proxy_->key_not_exists_beyond_output_level = true;
+
+ std::atomic<bool> seek_done{false};
+ ROCKSDB_NAMESPACE::port::Thread compaction_thread([&] {
+ c_iter_->SeekToFirst();
+ ASSERT_FALSE(c_iter_->Valid());
+ ASSERT_TRUE(c_iter_->status().IsShutdownInProgress());
+ seek_done.store(true);
+ });
+
+ // Let key 1 through.
+ filter.WaitForStall(1);
+
+ // Shutdown during compaction filter call for key 2.
+ filter.WaitForStall(2);
+ shutting_down_.store(true);
+ EXPECT_FALSE(seek_done.load());
+
+ // Unstall filter and wait for SeekToFirst() to return.
+ filter.stall_at.store(3);
+ compaction_thread.join();
+ assert(seek_done.load());
+
+ // Check that filter was never called again.
+ EXPECT_EQ(2, filter.last_seen.load());
+}
+
+TEST_P(CompactionIteratorTest, SingleMergeOperand) {
+ class Filter : public CompactionFilter {
+ Decision FilterV2(int /*level*/, const Slice& key, ValueType t,
+ const Slice& existing_value, std::string* /*new_value*/,
+ std::string* /*skip_until*/) const override {
+ std::string k = key.ToString();
+ std::string v = existing_value.ToString();
+
+ // See InitIterators() call below for the sequence of keys and their
+      // filtering decisions. Here we assert that the compaction filter is
+      // called with exactly the expected keys, and with the right values.
+ if (k == "a") {
+ EXPECT_EQ(ValueType::kMergeOperand, t);
+ EXPECT_EQ("av1", v);
+ return Decision::kKeep;
+ } else if (k == "b") {
+ EXPECT_EQ(ValueType::kMergeOperand, t);
+ return Decision::kKeep;
+ } else if (k == "c") {
+ return Decision::kKeep;
+ }
+
+ ADD_FAILURE();
+ return Decision::kKeep;
+ }
+
+ const char* Name() const override {
+ return "CompactionIteratorTest.SingleMergeOperand::Filter";
+ }
+ };
+
+ class SingleMergeOp : public MergeOperator {
+ public:
+ bool FullMergeV2(const MergeOperationInput& merge_in,
+ MergeOperationOutput* merge_out) const override {
+ // See InitIterators() call below for why "c" is the only key for which
+ // FullMergeV2 should be called.
+ EXPECT_EQ("c", merge_in.key.ToString());
+
+ std::string temp_value;
+ if (merge_in.existing_value != nullptr) {
+ temp_value = merge_in.existing_value->ToString();
+ }
+
+ for (auto& operand : merge_in.operand_list) {
+ temp_value.append(operand.ToString());
+ }
+ merge_out->new_value = temp_value;
+
+ return true;
+ }
+
+ bool PartialMergeMulti(const Slice& key,
+ const std::deque<Slice>& operand_list,
+ std::string* new_value,
+ Logger* /*logger*/) const override {
+ std::string string_key = key.ToString();
+ EXPECT_TRUE(string_key == "a" || string_key == "b");
+
+ if (string_key == "a") {
+ EXPECT_EQ(1, operand_list.size());
+ } else if (string_key == "b") {
+ EXPECT_EQ(2, operand_list.size());
+ }
+
+ std::string temp_value;
+ for (auto& operand : operand_list) {
+ temp_value.append(operand.ToString());
+ }
+ swap(temp_value, *new_value);
+
+ return true;
+ }
+
+ const char* Name() const override {
+ return "CompactionIteratorTest SingleMergeOp";
+ }
+
+ bool AllowSingleOperand() const override { return true; }
+ };
+
+ SingleMergeOp merge_op;
+ Filter filter;
+ InitIterators(
+ // a should invoke PartialMergeMulti with a single merge operand.
+ {test::KeyStr("a", 50, kTypeMerge),
+ // b should invoke PartialMergeMulti with two operands.
+ test::KeyStr("b", 70, kTypeMerge), test::KeyStr("b", 60, kTypeMerge),
+ // c should invoke FullMerge due to kTypeValue at the beginning.
+ test::KeyStr("c", 90, kTypeMerge), test::KeyStr("c", 80, kTypeValue)},
+ {"av1", "bv2", "bv1", "cv2", "cv1"}, {}, {}, kMaxSequenceNumber,
+ kMaxSequenceNumber, &merge_op, &filter);
+
+ c_iter_->SeekToFirst();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("a", 50, kTypeMerge), c_iter_->key().ToString());
+ ASSERT_EQ("av1", c_iter_->value().ToString());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ("bv1bv2", c_iter_->value().ToString());
+ c_iter_->Next();
+ ASSERT_EQ("cv1cv2", c_iter_->value().ToString());
+}
+
+// In bottommost level, values earlier than earliest snapshot can be output
+// with sequence = 0.
+TEST_P(CompactionIteratorTest, ZeroOutSequenceAtBottomLevel) {
+ AddSnapshot(1);
+ RunTest({test::KeyStr("a", 1, kTypeValue), test::KeyStr("b", 2, kTypeValue)},
+ {"v1", "v2"},
+ {test::KeyStr("a", 0, kTypeValue), test::KeyStr("b", 2, kTypeValue)},
+          {"v1", "v2"}, kMaxSequenceNumber /*last_committed_seq*/,
+ nullptr /*merge_operator*/, nullptr /*compaction_filter*/,
+ true /*bottommost_level*/);
+}
+
+// In bottommost level, deletions earlier than earliest snapshot can be removed
+// permanently.
+TEST_P(CompactionIteratorTest, RemoveDeletionAtBottomLevel) {
+ AddSnapshot(1);
+ RunTest({test::KeyStr("a", 1, kTypeDeletion),
+ test::KeyStr("b", 3, kTypeDeletion),
+ test::KeyStr("b", 1, kTypeValue)},
+ {"", "", ""},
+ {test::KeyStr("b", 3, kTypeDeletion),
+ test::KeyStr("b", 0, kTypeValue)},
+ {"", ""},
+          kMaxSequenceNumber /*last_committed_seq*/, nullptr /*merge_operator*/,
+ nullptr /*compaction_filter*/, true /*bottommost_level*/);
+}
+
+// In bottommost level, single deletions earlier than earliest snapshot can be
+// removed permanently.
+TEST_P(CompactionIteratorTest, RemoveSingleDeletionAtBottomLevel) {
+ AddSnapshot(1);
+ RunTest({test::KeyStr("a", 1, kTypeSingleDeletion),
+ test::KeyStr("b", 2, kTypeSingleDeletion)},
+ {"", ""}, {test::KeyStr("b", 2, kTypeSingleDeletion)}, {""},
+          kMaxSequenceNumber /*last_committed_seq*/, nullptr /*merge_operator*/,
+ nullptr /*compaction_filter*/, true /*bottommost_level*/);
+}
+
+INSTANTIATE_TEST_CASE_P(CompactionIteratorTestInstance, CompactionIteratorTest,
+ testing::Values(true, false));
+
+// Tests how CompactionIterator work together with SnapshotChecker.
+class CompactionIteratorWithSnapshotCheckerTest
+ : public CompactionIteratorTest {
+ public:
+ bool UseSnapshotChecker() const override { return true; }
+};
+
+// Uncommitted keys (keys with seq > last_committed_seq) should be output
+// as-is, while committed versions of these keys should get compacted as usual.
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ PreserveUncommittedKeys_Value) {
+ RunTest(
+ {test::KeyStr("foo", 3, kTypeValue), test::KeyStr("foo", 2, kTypeValue),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"v3", "v2", "v1"},
+ {test::KeyStr("foo", 3, kTypeValue), test::KeyStr("foo", 2, kTypeValue)},
+ {"v3", "v2"}, 2 /*last_committed_seq*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ PreserveUncommittedKeys_Deletion) {
+ RunTest({test::KeyStr("foo", 2, kTypeDeletion),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"", "v1"},
+ {test::KeyStr("foo", 2, kTypeDeletion),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"", "v1"}, 1 /*last_committed_seq*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ PreserveUncommittedKeys_Merge) {
+ auto merge_op = MergeOperators::CreateStringAppendOperator();
+ RunTest(
+ {test::KeyStr("foo", 3, kTypeMerge), test::KeyStr("foo", 2, kTypeMerge),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"v3", "v2", "v1"},
+ {test::KeyStr("foo", 3, kTypeMerge), test::KeyStr("foo", 2, kTypeValue)},
+ {"v3", "v1,v2"}, 2 /*last_committed_seq*/, merge_op.get());
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ PreserveUncommittedKeys_SingleDelete) {
+ RunTest({test::KeyStr("foo", 2, kTypeSingleDeletion),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"", "v1"},
+ {test::KeyStr("foo", 2, kTypeSingleDeletion),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"", "v1"}, 1 /*last_committed_seq*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ PreserveUncommittedKeys_BlobIndex) {
+ RunTest({test::KeyStr("foo", 3, kTypeBlobIndex),
+ test::KeyStr("foo", 2, kTypeBlobIndex),
+ test::KeyStr("foo", 1, kTypeBlobIndex)},
+ {"v3", "v2", "v1"},
+ {test::KeyStr("foo", 3, kTypeBlobIndex),
+ test::KeyStr("foo", 2, kTypeBlobIndex)},
+ {"v3", "v2"}, 2 /*last_committed_seq*/);
+}
+
+// Test that the compaction iterator dedups keys visible to the same snapshot.
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest, DedupSameSnapshot_Value) {
+ AddSnapshot(2, 1);
+ RunTest(
+ {test::KeyStr("foo", 4, kTypeValue), test::KeyStr("foo", 3, kTypeValue),
+ test::KeyStr("foo", 2, kTypeValue), test::KeyStr("foo", 1, kTypeValue)},
+ {"v4", "v3", "v2", "v1"},
+ {test::KeyStr("foo", 4, kTypeValue), test::KeyStr("foo", 3, kTypeValue),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"v4", "v3", "v1"}, 3 /*last_committed_seq*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest, DedupSameSnapshot_Deletion) {
+ AddSnapshot(2, 1);
+ RunTest(
+ {test::KeyStr("foo", 4, kTypeValue),
+ test::KeyStr("foo", 3, kTypeDeletion),
+ test::KeyStr("foo", 2, kTypeValue), test::KeyStr("foo", 1, kTypeValue)},
+ {"v4", "", "v2", "v1"},
+ {test::KeyStr("foo", 4, kTypeValue),
+ test::KeyStr("foo", 3, kTypeDeletion),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"v4", "", "v1"}, 3 /*last_committed_seq*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest, DedupSameSnapshot_Merge) {
+ AddSnapshot(2, 1);
+ AddSnapshot(4, 3);
+ auto merge_op = MergeOperators::CreateStringAppendOperator();
+ RunTest(
+ {test::KeyStr("foo", 5, kTypeMerge), test::KeyStr("foo", 4, kTypeMerge),
+ test::KeyStr("foo", 3, kTypeMerge), test::KeyStr("foo", 2, kTypeMerge),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"v5", "v4", "v3", "v2", "v1"},
+ {test::KeyStr("foo", 5, kTypeMerge), test::KeyStr("foo", 4, kTypeMerge),
+ test::KeyStr("foo", 3, kTypeMerge), test::KeyStr("foo", 1, kTypeValue)},
+ {"v5", "v4", "v2,v3", "v1"}, 4 /*last_committed_seq*/, merge_op.get());
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ DedupSameSnapshot_SingleDeletion) {
+ AddSnapshot(2, 1);
+ RunTest(
+ {test::KeyStr("foo", 4, kTypeValue),
+ test::KeyStr("foo", 3, kTypeSingleDeletion),
+ test::KeyStr("foo", 2, kTypeValue), test::KeyStr("foo", 1, kTypeValue)},
+ {"v4", "", "v2", "v1"},
+ {test::KeyStr("foo", 4, kTypeValue), test::KeyStr("foo", 1, kTypeValue)},
+ {"v4", "v1"}, 3 /*last_committed_seq*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest, DedupSameSnapshot_BlobIndex) {
+ AddSnapshot(2, 1);
+ RunTest({test::KeyStr("foo", 4, kTypeBlobIndex),
+ test::KeyStr("foo", 3, kTypeBlobIndex),
+ test::KeyStr("foo", 2, kTypeBlobIndex),
+ test::KeyStr("foo", 1, kTypeBlobIndex)},
+ {"v4", "v3", "v2", "v1"},
+ {test::KeyStr("foo", 4, kTypeBlobIndex),
+ test::KeyStr("foo", 3, kTypeBlobIndex),
+ test::KeyStr("foo", 1, kTypeBlobIndex)},
+ {"v4", "v3", "v1"}, 3 /*last_committed_seq*/);
+}
+
+// At the bottom level, sequence numbers can be zeroed out, and deletions can
+// be removed, but only when they are visible to the earliest snapshot.
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ NotZeroOutSequenceIfNotVisibleToEarliestSnapshot) {
+ AddSnapshot(2, 1);
+ RunTest({test::KeyStr("a", 1, kTypeValue), test::KeyStr("b", 2, kTypeValue),
+ test::KeyStr("c", 3, kTypeValue)},
+ {"v1", "v2", "v3"},
+ {test::KeyStr("a", 0, kTypeValue), test::KeyStr("b", 2, kTypeValue),
+ test::KeyStr("c", 3, kTypeValue)},
+          {"v1", "v2", "v3"}, kMaxSequenceNumber /*last_committed_seq*/,
+ nullptr /*merge_operator*/, nullptr /*compaction_filter*/,
+ true /*bottommost_level*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ NotRemoveDeletionIfNotVisibleToEarliestSnapshot) {
+ AddSnapshot(2, 1);
+ RunTest(
+ {test::KeyStr("a", 1, kTypeDeletion), test::KeyStr("b", 2, kTypeDeletion),
+ test::KeyStr("c", 3, kTypeDeletion)},
+ {"", "", ""},
+ {},
+      {"", ""}, kMaxSequenceNumber /*last_committed_seq*/,
+ nullptr /*merge_operator*/, nullptr /*compaction_filter*/,
+ true /*bottommost_level*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ NotRemoveDeletionIfValuePresentToEarlierSnapshot) {
+  AddSnapshot(2, 1);
+ RunTest(
+ {test::KeyStr("a", 4, kTypeDeletion), test::KeyStr("a", 1, kTypeValue),
+ test::KeyStr("b", 3, kTypeValue)},
+ {"", "", ""},
+ {test::KeyStr("a", 4, kTypeDeletion), test::KeyStr("a", 0, kTypeValue),
+ test::KeyStr("b", 3, kTypeValue)},
+      {"", "", ""}, kMaxSequenceNumber /*last_committed_seq*/,
+ nullptr /*merge_operator*/, nullptr /*compaction_filter*/,
+ true /*bottommost_level*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ NotRemoveSingleDeletionIfNotVisibleToEarliestSnapshot) {
+ AddSnapshot(2, 1);
+ RunTest({test::KeyStr("a", 1, kTypeSingleDeletion),
+ test::KeyStr("b", 2, kTypeSingleDeletion),
+ test::KeyStr("c", 3, kTypeSingleDeletion)},
+ {"", "", ""},
+ {test::KeyStr("b", 2, kTypeSingleDeletion),
+ test::KeyStr("c", 3, kTypeSingleDeletion)},
+          {"", ""}, kMaxSequenceNumber /*last_committed_seq*/,
+ nullptr /*merge_operator*/, nullptr /*compaction_filter*/,
+ true /*bottommost_level*/);
+}
+
+// Single delete should not cancel out values that are not visible to the
+// same set of snapshots.
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ SingleDeleteAcrossSnapshotBoundary) {
+ AddSnapshot(2, 1);
+ RunTest({test::KeyStr("a", 2, kTypeSingleDeletion),
+ test::KeyStr("a", 1, kTypeValue)},
+ {"", "v1"},
+ {test::KeyStr("a", 2, kTypeSingleDeletion),
+ test::KeyStr("a", 1, kTypeValue)},
+ {"", "v1"}, 2 /*last_committed_seq*/);
+}
+
+// Single delete should be kept if it is not visible to the earliest write
+// conflict snapshot. If a single delete is kept for this reason, the
+// corresponding value can be trimmed to save space.
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ KeepSingleDeletionForWriteConflictChecking) {
+ AddSnapshot(2, 0);
+ RunTest({test::KeyStr("a", 2, kTypeSingleDeletion),
+ test::KeyStr("a", 1, kTypeValue)},
+ {"", "v1"},
+ {test::KeyStr("a", 2, kTypeSingleDeletion),
+ test::KeyStr("a", 1, kTypeValue)},
+ {"", ""}, 2 /*last_committed_seq*/, nullptr /*merge_operator*/,
+ nullptr /*compaction_filter*/, false /*bottommost_level*/,
+ 2 /*earliest_write_conflict_snapshot*/);
+}
+
+// Compaction filter should keep uncommitted keys as-is, and
+// * convert the latest value to a deletion, and/or
+// * if the latest value is a merge, apply the filter to all subsequent merges.
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest, CompactionFilter_Value) {
+ std::unique_ptr<CompactionFilter> compaction_filter(
+ new FilterAllKeysCompactionFilter());
+ RunTest(
+ {test::KeyStr("a", 2, kTypeValue), test::KeyStr("a", 1, kTypeValue),
+ test::KeyStr("b", 3, kTypeValue), test::KeyStr("c", 1, kTypeValue)},
+ {"v2", "v1", "v3", "v4"},
+ {test::KeyStr("a", 2, kTypeValue), test::KeyStr("a", 1, kTypeDeletion),
+ test::KeyStr("b", 3, kTypeValue), test::KeyStr("c", 1, kTypeDeletion)},
+ {"v2", "", "v3", ""}, 1 /*last_committed_seq*/,
+ nullptr /*merge_operator*/, compaction_filter.get());
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest, CompactionFilter_Deletion) {
+ std::unique_ptr<CompactionFilter> compaction_filter(
+ new FilterAllKeysCompactionFilter());
+ RunTest(
+ {test::KeyStr("a", 2, kTypeDeletion), test::KeyStr("a", 1, kTypeValue)},
+ {"", "v1"},
+ {test::KeyStr("a", 2, kTypeDeletion),
+ test::KeyStr("a", 1, kTypeDeletion)},
+ {"", ""}, 1 /*last_committed_seq*/, nullptr /*merge_operator*/,
+ compaction_filter.get());
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ CompactionFilter_PartialMerge) {
+ std::shared_ptr<MergeOperator> merge_op =
+ MergeOperators::CreateStringAppendOperator();
+ std::unique_ptr<CompactionFilter> compaction_filter(
+ new FilterAllKeysCompactionFilter());
+ RunTest({test::KeyStr("a", 3, kTypeMerge), test::KeyStr("a", 2, kTypeMerge),
+ test::KeyStr("a", 1, kTypeMerge)},
+ {"v3", "v2", "v1"}, {test::KeyStr("a", 3, kTypeMerge)}, {"v3"},
+ 2 /*last_committed_seq*/, merge_op.get(), compaction_filter.get());
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest, CompactionFilter_FullMerge) {
+ std::shared_ptr<MergeOperator> merge_op =
+ MergeOperators::CreateStringAppendOperator();
+ std::unique_ptr<CompactionFilter> compaction_filter(
+ new FilterAllKeysCompactionFilter());
+ RunTest(
+ {test::KeyStr("a", 3, kTypeMerge), test::KeyStr("a", 2, kTypeMerge),
+ test::KeyStr("a", 1, kTypeValue)},
+ {"v3", "v2", "v1"},
+ {test::KeyStr("a", 3, kTypeMerge), test::KeyStr("a", 1, kTypeDeletion)},
+ {"v3", ""}, 2 /*last_committed_seq*/, merge_op.get(),
+ compaction_filter.get());
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/compaction/compaction_job.cc b/src/rocksdb/db/compaction/compaction_job.cc
new file mode 100644
index 000000000..576ec7b45
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_job.cc
@@ -0,0 +1,1700 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <algorithm>
+#include <cinttypes>
+#include <functional>
+#include <list>
+#include <memory>
+#include <random>
+#include <set>
+#include <thread>
+#include <utility>
+#include <vector>
+
+#include "db/builder.h"
+#include "db/compaction/compaction_job.h"
+#include "db/db_impl/db_impl.h"
+#include "db/db_iter.h"
+#include "db/dbformat.h"
+#include "db/error_handler.h"
+#include "db/event_helpers.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/memtable_list.h"
+#include "db/merge_context.h"
+#include "db/merge_helper.h"
+#include "db/range_del_aggregator.h"
+#include "db/version_set.h"
+#include "file/filename.h"
+#include "file/read_write_util.h"
+#include "file/sst_file_manager_impl.h"
+#include "file/writable_file_writer.h"
+#include "logging/log_buffer.h"
+#include "logging/logging.h"
+#include "monitoring/iostats_context_imp.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/thread_status_util.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/merging_iterator.h"
+#include "table/table_builder.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+#include "util/mutexlock.h"
+#include "util/random.h"
+#include "util/stop_watch.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const char* GetCompactionReasonString(CompactionReason compaction_reason) {
+ switch (compaction_reason) {
+ case CompactionReason::kUnknown:
+ return "Unknown";
+ case CompactionReason::kLevelL0FilesNum:
+ return "LevelL0FilesNum";
+ case CompactionReason::kLevelMaxLevelSize:
+ return "LevelMaxLevelSize";
+ case CompactionReason::kUniversalSizeAmplification:
+ return "UniversalSizeAmplification";
+ case CompactionReason::kUniversalSizeRatio:
+ return "UniversalSizeRatio";
+ case CompactionReason::kUniversalSortedRunNum:
+ return "UniversalSortedRunNum";
+ case CompactionReason::kFIFOMaxSize:
+ return "FIFOMaxSize";
+ case CompactionReason::kFIFOReduceNumFiles:
+ return "FIFOReduceNumFiles";
+ case CompactionReason::kFIFOTtl:
+ return "FIFOTtl";
+ case CompactionReason::kManualCompaction:
+ return "ManualCompaction";
+ case CompactionReason::kFilesMarkedForCompaction:
+ return "FilesMarkedForCompaction";
+ case CompactionReason::kBottommostFiles:
+ return "BottommostFiles";
+ case CompactionReason::kTtl:
+ return "Ttl";
+ case CompactionReason::kFlush:
+ return "Flush";
+ case CompactionReason::kExternalSstIngestion:
+ return "ExternalSstIngestion";
+ case CompactionReason::kPeriodicCompaction:
+ return "PeriodicCompaction";
+ case CompactionReason::kNumOfReasons:
+ // fall through
+ default:
+ assert(false);
+ return "Invalid";
+ }
+}
+
+// Maintains state for each sub-compaction
+struct CompactionJob::SubcompactionState {
+ const Compaction* compaction;
+ std::unique_ptr<CompactionIterator> c_iter;
+
+ // The boundaries of the key-range this compaction is interested in. No two
+ // subcompactions may have overlapping key-ranges.
+ // 'start' is inclusive, 'end' is exclusive, and nullptr means unbounded
+ Slice *start, *end;
+
+ // The return status of this subcompaction
+ Status status;
+
+ // Files produced by this subcompaction
+ struct Output {
+ FileMetaData meta;
+ bool finished;
+ std::shared_ptr<const TableProperties> table_properties;
+ };
+
+ // State kept for output being generated
+ std::vector<Output> outputs;
+ std::unique_ptr<WritableFileWriter> outfile;
+ std::unique_ptr<TableBuilder> builder;
+ Output* current_output() {
+ if (outputs.empty()) {
+      // This subcompaction's output could be empty if compaction was aborted
+      // before this subcompaction had a chance to generate any output files.
+      // When subcompactions are executed sequentially this is more likely,
+      // and the later subcompactions are particularly likely to be empty.
+      // Once they are run in parallel, however, it should be much rarer.
+ return nullptr;
+ } else {
+ return &outputs.back();
+ }
+ }
+
+ uint64_t current_output_file_size;
+
+ // State during the subcompaction
+ uint64_t total_bytes;
+ uint64_t num_output_records;
+ CompactionJobStats compaction_job_stats;
+ uint64_t approx_size;
+  // An index used to speed up ShouldStopBefore().
+ size_t grandparent_index = 0;
+ // The number of bytes overlapping between the current output and
+ // grandparent files used in ShouldStopBefore().
+ uint64_t overlapped_bytes = 0;
+  // A flag indicating whether a key has been seen in ShouldStopBefore()
+ bool seen_key = false;
+
+ SubcompactionState(Compaction* c, Slice* _start, Slice* _end,
+ uint64_t size = 0)
+ : compaction(c),
+ start(_start),
+ end(_end),
+ outfile(nullptr),
+ builder(nullptr),
+ current_output_file_size(0),
+ total_bytes(0),
+ num_output_records(0),
+ approx_size(size),
+ grandparent_index(0),
+ overlapped_bytes(0),
+ seen_key(false) {
+ assert(compaction != nullptr);
+ }
+
+ SubcompactionState(SubcompactionState&& o) { *this = std::move(o); }
+
+ SubcompactionState& operator=(SubcompactionState&& o) {
+ compaction = std::move(o.compaction);
+ start = std::move(o.start);
+ end = std::move(o.end);
+ status = std::move(o.status);
+ outputs = std::move(o.outputs);
+ outfile = std::move(o.outfile);
+ builder = std::move(o.builder);
+ current_output_file_size = std::move(o.current_output_file_size);
+ total_bytes = std::move(o.total_bytes);
+ num_output_records = std::move(o.num_output_records);
+ compaction_job_stats = std::move(o.compaction_job_stats);
+ approx_size = std::move(o.approx_size);
+ grandparent_index = std::move(o.grandparent_index);
+ overlapped_bytes = std::move(o.overlapped_bytes);
+ seen_key = std::move(o.seen_key);
+ return *this;
+ }
+
+ // Because member std::unique_ptrs do not have these.
+ SubcompactionState(const SubcompactionState&) = delete;
+
+ SubcompactionState& operator=(const SubcompactionState&) = delete;
+
+ // Returns true iff we should stop building the current output
+ // before processing "internal_key".
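+  // The heuristic: track how many bytes in grandparent (output_level + 1)
+  // files the keys written so far overlap. Once that overlap would exceed
+  // max_compaction_bytes, cut the current output file so that a future
+  // compaction of this file into the grandparent level stays bounded.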
+ bool ShouldStopBefore(const Slice& internal_key, uint64_t curr_file_size) {
+ const InternalKeyComparator* icmp =
+ &compaction->column_family_data()->internal_comparator();
+ const std::vector<FileMetaData*>& grandparents = compaction->grandparents();
+
+ // Scan to find earliest grandparent file that contains key.
+ while (grandparent_index < grandparents.size() &&
+ icmp->Compare(internal_key,
+ grandparents[grandparent_index]->largest.Encode()) >
+ 0) {
+ if (seen_key) {
+ overlapped_bytes += grandparents[grandparent_index]->fd.GetFileSize();
+ }
+ assert(grandparent_index + 1 >= grandparents.size() ||
+ icmp->Compare(
+ grandparents[grandparent_index]->largest.Encode(),
+ grandparents[grandparent_index + 1]->smallest.Encode()) <= 0);
+ grandparent_index++;
+ }
+ seen_key = true;
+
+ if (overlapped_bytes + curr_file_size >
+ compaction->max_compaction_bytes()) {
+ // Too much overlap for current output; start new output
+ overlapped_bytes = 0;
+ return true;
+ }
+
+ return false;
+ }
+};
+
+// Maintains state for the entire compaction
+struct CompactionJob::CompactionState {
+ Compaction* const compaction;
+
+ // REQUIRED: subcompaction states are stored in order of increasing
+ // key-range
+ std::vector<CompactionJob::SubcompactionState> sub_compact_states;
+ Status status;
+
+ uint64_t total_bytes;
+ uint64_t num_output_records;
+
+ explicit CompactionState(Compaction* c)
+ : compaction(c),
+ total_bytes(0),
+ num_output_records(0) {}
+
+ size_t NumOutputFiles() {
+ size_t total = 0;
+ for (auto& s : sub_compact_states) {
+ total += s.outputs.size();
+ }
+ return total;
+ }
+
+ Slice SmallestUserKey() {
+ for (const auto& sub_compact_state : sub_compact_states) {
+ if (!sub_compact_state.outputs.empty() &&
+ sub_compact_state.outputs[0].finished) {
+ return sub_compact_state.outputs[0].meta.smallest.user_key();
+ }
+ }
+ // If there is no finished output, return an empty slice.
+ return Slice(nullptr, 0);
+ }
+
+ Slice LargestUserKey() {
+ for (auto it = sub_compact_states.rbegin(); it < sub_compact_states.rend();
+ ++it) {
+ if (!it->outputs.empty() && it->current_output()->finished) {
+ assert(it->current_output() != nullptr);
+ return it->current_output()->meta.largest.user_key();
+ }
+ }
+ // If there is no finished output, return an empty slice.
+ return Slice(nullptr, 0);
+ }
+};
+
+void CompactionJob::AggregateStatistics() {
+ for (SubcompactionState& sc : compact_->sub_compact_states) {
+ compact_->total_bytes += sc.total_bytes;
+ compact_->num_output_records += sc.num_output_records;
+ }
+ if (compaction_job_stats_) {
+ for (SubcompactionState& sc : compact_->sub_compact_states) {
+ compaction_job_stats_->Add(sc.compaction_job_stats);
+ }
+ }
+}
+
+CompactionJob::CompactionJob(
+ int job_id, Compaction* compaction, const ImmutableDBOptions& db_options,
+ const FileOptions& file_options, VersionSet* versions,
+ const std::atomic<bool>* shutting_down,
+ const SequenceNumber preserve_deletes_seqnum, LogBuffer* log_buffer,
+ Directory* db_directory, Directory* output_directory, Statistics* stats,
+ InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler,
+ std::vector<SequenceNumber> existing_snapshots,
+ SequenceNumber earliest_write_conflict_snapshot,
+ const SnapshotChecker* snapshot_checker, std::shared_ptr<Cache> table_cache,
+ EventLogger* event_logger, bool paranoid_file_checks, bool measure_io_stats,
+ const std::string& dbname, CompactionJobStats* compaction_job_stats,
+ Env::Priority thread_pri, const std::atomic<bool>* manual_compaction_paused)
+ : job_id_(job_id),
+ compact_(new CompactionState(compaction)),
+ compaction_job_stats_(compaction_job_stats),
+ compaction_stats_(compaction->compaction_reason(), 1),
+ dbname_(dbname),
+ db_options_(db_options),
+ file_options_(file_options),
+ env_(db_options.env),
+ fs_(db_options.fs.get()),
+ file_options_for_read_(
+ fs_->OptimizeForCompactionTableRead(file_options, db_options_)),
+ versions_(versions),
+ shutting_down_(shutting_down),
+ manual_compaction_paused_(manual_compaction_paused),
+ preserve_deletes_seqnum_(preserve_deletes_seqnum),
+ log_buffer_(log_buffer),
+ db_directory_(db_directory),
+ output_directory_(output_directory),
+ stats_(stats),
+ db_mutex_(db_mutex),
+ db_error_handler_(db_error_handler),
+ existing_snapshots_(std::move(existing_snapshots)),
+ earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot),
+ snapshot_checker_(snapshot_checker),
+ table_cache_(std::move(table_cache)),
+ event_logger_(event_logger),
+ bottommost_level_(false),
+ paranoid_file_checks_(paranoid_file_checks),
+ measure_io_stats_(measure_io_stats),
+ write_hint_(Env::WLTH_NOT_SET),
+ thread_pri_(thread_pri) {
+ assert(log_buffer_ != nullptr);
+ const auto* cfd = compact_->compaction->column_family_data();
+ ThreadStatusUtil::SetColumnFamily(cfd, cfd->ioptions()->env,
+ db_options_.enable_thread_tracking);
+ ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION);
+ ReportStartedCompaction(compaction);
+}
+
+CompactionJob::~CompactionJob() {
+ assert(compact_ == nullptr);
+ ThreadStatusUtil::ResetThreadStatus();
+}
+
+void CompactionJob::ReportStartedCompaction(Compaction* compaction) {
+ const auto* cfd = compact_->compaction->column_family_data();
+ ThreadStatusUtil::SetColumnFamily(cfd, cfd->ioptions()->env,
+ db_options_.enable_thread_tracking);
+
+ ThreadStatusUtil::SetThreadOperationProperty(ThreadStatus::COMPACTION_JOB_ID,
+ job_id_);
+
+ ThreadStatusUtil::SetThreadOperationProperty(
+ ThreadStatus::COMPACTION_INPUT_OUTPUT_LEVEL,
+ (static_cast<uint64_t>(compact_->compaction->start_level()) << 32) +
+ compact_->compaction->output_level());
+
+ // In the current design, a CompactionJob is always created
+ // for non-trivial compaction.
+ assert(compaction->IsTrivialMove() == false ||
+ compaction->is_manual_compaction() == true);
+
+ ThreadStatusUtil::SetThreadOperationProperty(
+ ThreadStatus::COMPACTION_PROP_FLAGS,
+ compaction->is_manual_compaction() +
+ (compaction->deletion_compaction() << 1));
+
+ ThreadStatusUtil::SetThreadOperationProperty(
+ ThreadStatus::COMPACTION_TOTAL_INPUT_BYTES,
+ compaction->CalculateTotalInputSize());
+
+ IOSTATS_RESET(bytes_written);
+ IOSTATS_RESET(bytes_read);
+ ThreadStatusUtil::SetThreadOperationProperty(
+ ThreadStatus::COMPACTION_BYTES_WRITTEN, 0);
+ ThreadStatusUtil::SetThreadOperationProperty(
+ ThreadStatus::COMPACTION_BYTES_READ, 0);
+
+ // Set the thread operation after operation properties
+ // to ensure GetThreadList() can always show them all together.
+ ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION);
+
+ if (compaction_job_stats_) {
+ compaction_job_stats_->is_manual_compaction =
+ compaction->is_manual_compaction();
+ }
+}
+
+void CompactionJob::Prepare() {
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_COMPACTION_PREPARE);
+
+  // Generate file_levels_ for the compaction before making the iterator
+ auto* c = compact_->compaction;
+ assert(c->column_family_data() != nullptr);
+ assert(c->column_family_data()->current()->storage_info()->NumLevelFiles(
+ compact_->compaction->level()) > 0);
+
+ write_hint_ =
+ c->column_family_data()->CalculateSSTWriteHint(c->output_level());
+ bottommost_level_ = c->bottommost_level();
+
+ if (c->ShouldFormSubcompactions()) {
+ {
+ StopWatch sw(env_, stats_, SUBCOMPACTION_SETUP_TIME);
+ GenSubcompactionBoundaries();
+ }
+ assert(sizes_.size() == boundaries_.size() + 1);
+
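+    // boundaries_ holds the interior split points, so k boundaries partition
+    // the key space into k + 1 subcompaction ranges; sizes_[i] is the
+    // approximate amount of input data falling into range i.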
+ for (size_t i = 0; i <= boundaries_.size(); i++) {
+ Slice* start = i == 0 ? nullptr : &boundaries_[i - 1];
+ Slice* end = i == boundaries_.size() ? nullptr : &boundaries_[i];
+ compact_->sub_compact_states.emplace_back(c, start, end, sizes_[i]);
+ }
+ RecordInHistogram(stats_, NUM_SUBCOMPACTIONS_SCHEDULED,
+ compact_->sub_compact_states.size());
+ } else {
+ compact_->sub_compact_states.emplace_back(c, nullptr, nullptr);
+ }
+}
+
+struct RangeWithSize {
+ Range range;
+ uint64_t size;
+
+ RangeWithSize(const Slice& a, const Slice& b, uint64_t s = 0)
+ : range(a, b), size(s) {}
+};
+
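+// Picks subcompaction boundaries as follows: collect candidate boundaries
+// from the endpoints of the input files, sort them and remove duplicates by
+// user key, estimate the amount of data between consecutive boundaries with
+// VersionSet::ApproximateSize, and finally group consecutive ranges greedily
+// into at most max_subcompactions roughly equal-sized subcompactions.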
+void CompactionJob::GenSubcompactionBoundaries() {
+ auto* c = compact_->compaction;
+ auto* cfd = c->column_family_data();
+ const Comparator* cfd_comparator = cfd->user_comparator();
+ std::vector<Slice> bounds;
+ int start_lvl = c->start_level();
+ int out_lvl = c->output_level();
+
+ // Add the starting and/or ending key of certain input files as a potential
+ // boundary
+ for (size_t lvl_idx = 0; lvl_idx < c->num_input_levels(); lvl_idx++) {
+ int lvl = c->level(lvl_idx);
+ if (lvl >= start_lvl && lvl <= out_lvl) {
+ const LevelFilesBrief* flevel = c->input_levels(lvl_idx);
+ size_t num_files = flevel->num_files;
+
+ if (num_files == 0) {
+ continue;
+ }
+
+ if (lvl == 0) {
+ // For level 0 add the starting and ending key of each file since the
+ // files may have greatly differing key ranges (not range-partitioned)
+ for (size_t i = 0; i < num_files; i++) {
+ bounds.emplace_back(flevel->files[i].smallest_key);
+ bounds.emplace_back(flevel->files[i].largest_key);
+ }
+ } else {
+ // For all other levels add the smallest/largest key in the level to
+ // encompass the range covered by that level
+ bounds.emplace_back(flevel->files[0].smallest_key);
+ bounds.emplace_back(flevel->files[num_files - 1].largest_key);
+ if (lvl == out_lvl) {
+ // For the last level include the starting keys of all files since
+ // the last level is the largest and probably has the widest key
+ // range. Since it's range partitioned, the ending key of one file
+ // and the starting key of the next are very close (or identical).
+ for (size_t i = 1; i < num_files; i++) {
+ bounds.emplace_back(flevel->files[i].smallest_key);
+ }
+ }
+ }
+ }
+ }
+
+ std::sort(bounds.begin(), bounds.end(),
+ [cfd_comparator](const Slice& a, const Slice& b) -> bool {
+ return cfd_comparator->Compare(ExtractUserKey(a),
+ ExtractUserKey(b)) < 0;
+ });
+ // Remove duplicated entries from bounds
+ bounds.erase(
+ std::unique(bounds.begin(), bounds.end(),
+ [cfd_comparator](const Slice& a, const Slice& b) -> bool {
+ return cfd_comparator->Compare(ExtractUserKey(a),
+ ExtractUserKey(b)) == 0;
+ }),
+ bounds.end());
+
+ // Combine consecutive pairs of boundaries into ranges with an approximate
+ // size of data covered by keys in that range
+ uint64_t sum = 0;
+ std::vector<RangeWithSize> ranges;
+  // Get the input version from the CompactionState since it was already
+  // referenced earlier in Compaction::SetInputVersion and will not change
+  // while db_mutex_ is released below
+ auto* v = compact_->compaction->input_version();
+ for (auto it = bounds.begin();;) {
+ const Slice a = *it;
+ ++it;
+
+ if (it == bounds.end()) {
+ break;
+ }
+
+ const Slice b = *it;
+
+    // ApproximateSize could potentially create a table reader iterator to
+    // seek to the index block and may incur I/O cost in the process. Unlock
+    // the DB mutex to reduce contention.
+ db_mutex_->Unlock();
+ uint64_t size = versions_->ApproximateSize(SizeApproximationOptions(), v, a,
+ b, start_lvl, out_lvl + 1,
+ TableReaderCaller::kCompaction);
+ db_mutex_->Lock();
+ ranges.emplace_back(a, b, size);
+ sum += size;
+ }
+
+ // Group the ranges into subcompactions
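+  // The subcompaction count is additionally capped by an estimate of how many
+  // output files this data will produce, assuming each output file is filled
+  // to about 80% (min_file_fill_percent) of the target file size for the
+  // output level; more subcompactions than output files would not help.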
+ const double min_file_fill_percent = 4.0 / 5;
+ int base_level = v->storage_info()->base_level();
+ uint64_t max_output_files = static_cast<uint64_t>(std::ceil(
+ sum / min_file_fill_percent /
+ MaxFileSizeForLevel(*(c->mutable_cf_options()), out_lvl,
+ c->immutable_cf_options()->compaction_style, base_level,
+ c->immutable_cf_options()->level_compaction_dynamic_level_bytes)));
+ uint64_t subcompactions =
+ std::min({static_cast<uint64_t>(ranges.size()),
+ static_cast<uint64_t>(c->max_subcompactions()),
+ max_output_files});
+
+ if (subcompactions > 1) {
+ double mean = sum * 1.0 / subcompactions;
+ // Greedily add ranges to the subcompaction until the sum of the ranges'
+ // sizes becomes >= the expected mean size of a subcompaction
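+    // For example, with four ranges of approximate sizes {10, 30, 20, 40} and
+    // subcompactions == 2, mean == 50: the first boundary is emitted after the
+    // third range (running sum 60 >= 50), producing subcompactions of roughly
+    // 60 and 40 bytes.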
+ sum = 0;
+ for (size_t i = 0; i < ranges.size() - 1; i++) {
+ sum += ranges[i].size;
+ if (subcompactions == 1) {
+ // If there's only one left to schedule then it goes to the end so no
+ // need to put an end boundary
+ continue;
+ }
+ if (sum >= mean) {
+ boundaries_.emplace_back(ExtractUserKey(ranges[i].range.limit));
+ sizes_.emplace_back(sum);
+ subcompactions--;
+ sum = 0;
+ }
+ }
+ sizes_.emplace_back(sum + ranges.back().size);
+ } else {
+ // Only one range so its size is the total sum of sizes computed above
+ sizes_.emplace_back(sum);
+ }
+}
+
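+// Runs all subcompactions (the first on the calling thread, the rest each on
+// their own thread), verifies that every output table is readable, and
+// aggregates the per-subcompaction statistics. The resulting version edit is
+// applied later in Install().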
+Status CompactionJob::Run() {
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_COMPACTION_RUN);
+ TEST_SYNC_POINT("CompactionJob::Run():Start");
+ log_buffer_->FlushBufferToLog();
+ LogCompaction();
+
+ const size_t num_threads = compact_->sub_compact_states.size();
+ assert(num_threads > 0);
+ const uint64_t start_micros = env_->NowMicros();
+
+ // Launch a thread for each of subcompactions 1...num_threads-1
+ std::vector<port::Thread> thread_pool;
+ thread_pool.reserve(num_threads - 1);
+ for (size_t i = 1; i < compact_->sub_compact_states.size(); i++) {
+ thread_pool.emplace_back(&CompactionJob::ProcessKeyValueCompaction, this,
+ &compact_->sub_compact_states[i]);
+ }
+
+ // Always schedule the first subcompaction (whether or not there are also
+ // others) in the current thread to be efficient with resources
+ ProcessKeyValueCompaction(&compact_->sub_compact_states[0]);
+
+ // Wait for all other threads (if there are any) to finish execution
+ for (auto& thread : thread_pool) {
+ thread.join();
+ }
+
+ compaction_stats_.micros = env_->NowMicros() - start_micros;
+ compaction_stats_.cpu_micros = 0;
+ for (size_t i = 0; i < compact_->sub_compact_states.size(); i++) {
+ compaction_stats_.cpu_micros +=
+ compact_->sub_compact_states[i].compaction_job_stats.cpu_micros;
+ }
+
+ RecordTimeToHistogram(stats_, COMPACTION_TIME, compaction_stats_.micros);
+ RecordTimeToHistogram(stats_, COMPACTION_CPU_TIME,
+ compaction_stats_.cpu_micros);
+
+ TEST_SYNC_POINT("CompactionJob::Run:BeforeVerify");
+
+ // Check if any thread encountered an error during execution
+ Status status;
+ for (const auto& state : compact_->sub_compact_states) {
+ if (!state.status.ok()) {
+ status = state.status;
+ break;
+ }
+ }
+
+ if (status.ok() && output_directory_) {
+ status = output_directory_->Fsync();
+ }
+
+ if (status.ok()) {
+ thread_pool.clear();
+ std::vector<const FileMetaData*> files_meta;
+ for (const auto& state : compact_->sub_compact_states) {
+ for (const auto& output : state.outputs) {
+ files_meta.emplace_back(&output.meta);
+ }
+ }
+ ColumnFamilyData* cfd = compact_->compaction->column_family_data();
+ auto prefix_extractor =
+ compact_->compaction->mutable_cf_options()->prefix_extractor.get();
+ std::atomic<size_t> next_file_meta_idx(0);
+ auto verify_table = [&](Status& output_status) {
+ while (true) {
+ size_t file_idx = next_file_meta_idx.fetch_add(1);
+ if (file_idx >= files_meta.size()) {
+ break;
+ }
+ // Verify that the table is usable
+        // We set for_compaction to false and don't use
+        // OptimizeForCompactionTableRead here because this is a special case
+        // after we finish building the table. Regardless of whether
+        // use_direct_io_for_flush_and_compaction is true, we regard this
+        // verification as user reads, since the goal is to cache the table
+        // here for further user reads.
+ InternalIterator* iter = cfd->table_cache()->NewIterator(
+ ReadOptions(), file_options_, cfd->internal_comparator(),
+ *files_meta[file_idx], /*range_del_agg=*/nullptr, prefix_extractor,
+ /*table_reader_ptr=*/nullptr,
+ cfd->internal_stats()->GetFileReadHist(
+ compact_->compaction->output_level()),
+ TableReaderCaller::kCompactionRefill, /*arena=*/nullptr,
+ /*skip_filters=*/false, compact_->compaction->output_level(),
+ /*smallest_compaction_key=*/nullptr,
+ /*largest_compaction_key=*/nullptr);
+ auto s = iter->status();
+
+ if (s.ok() && paranoid_file_checks_) {
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {}
+ s = iter->status();
+ }
+
+ delete iter;
+
+ if (!s.ok()) {
+ output_status = s;
+ break;
+ }
+ }
+ };
+ for (size_t i = 1; i < compact_->sub_compact_states.size(); i++) {
+ thread_pool.emplace_back(verify_table,
+ std::ref(compact_->sub_compact_states[i].status));
+ }
+ verify_table(compact_->sub_compact_states[0].status);
+ for (auto& thread : thread_pool) {
+ thread.join();
+ }
+ for (const auto& state : compact_->sub_compact_states) {
+ if (!state.status.ok()) {
+ status = state.status;
+ break;
+ }
+ }
+ }
+
+ TablePropertiesCollection tp;
+ for (const auto& state : compact_->sub_compact_states) {
+ for (const auto& output : state.outputs) {
+ auto fn =
+ TableFileName(state.compaction->immutable_cf_options()->cf_paths,
+ output.meta.fd.GetNumber(), output.meta.fd.GetPathId());
+ tp[fn] = output.table_properties;
+ }
+ }
+ compact_->compaction->SetOutputTableProperties(std::move(tp));
+
+ // Finish up all book-keeping to unify the subcompaction results
+ AggregateStatistics();
+ UpdateCompactionStats();
+ RecordCompactionIOStats();
+ LogFlush(db_options_.info_log);
+ TEST_SYNC_POINT("CompactionJob::Run():End");
+
+ compact_->status = status;
+ return status;
+}
+
+Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) {
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_COMPACTION_INSTALL);
+ db_mutex_->AssertHeld();
+ Status status = compact_->status;
+ ColumnFamilyData* cfd = compact_->compaction->column_family_data();
+ cfd->internal_stats()->AddCompactionStats(
+ compact_->compaction->output_level(), thread_pri_, compaction_stats_);
+
+ if (status.ok()) {
+ status = InstallCompactionResults(mutable_cf_options);
+ }
+ VersionStorageInfo::LevelSummaryStorage tmp;
+ auto vstorage = cfd->current()->storage_info();
+ const auto& stats = compaction_stats_;
+
+ double read_write_amp = 0.0;
+ double write_amp = 0.0;
+ double bytes_read_per_sec = 0;
+ double bytes_written_per_sec = 0;
+
+ if (stats.bytes_read_non_output_levels > 0) {
+ read_write_amp = (stats.bytes_written + stats.bytes_read_output_level +
+ stats.bytes_read_non_output_levels) /
+ static_cast<double>(stats.bytes_read_non_output_levels);
+ write_amp = stats.bytes_written /
+ static_cast<double>(stats.bytes_read_non_output_levels);
+ }
+ if (stats.micros > 0) {
+ bytes_read_per_sec =
+ (stats.bytes_read_non_output_levels + stats.bytes_read_output_level) /
+ static_cast<double>(stats.micros);
+ bytes_written_per_sec =
+ stats.bytes_written / static_cast<double>(stats.micros);
+ }
+
+ ROCKS_LOG_BUFFER(
+ log_buffer_,
+ "[%s] compacted to: %s, MB/sec: %.1f rd, %.1f wr, level %d, "
+ "files in(%d, %d) out(%d) "
+ "MB in(%.1f, %.1f) out(%.1f), read-write-amplify(%.1f) "
+ "write-amplify(%.1f) %s, records in: %" PRIu64
+ ", records dropped: %" PRIu64 " output_compression: %s\n",
+ cfd->GetName().c_str(), vstorage->LevelSummary(&tmp), bytes_read_per_sec,
+ bytes_written_per_sec, compact_->compaction->output_level(),
+ stats.num_input_files_in_non_output_levels,
+ stats.num_input_files_in_output_level, stats.num_output_files,
+ stats.bytes_read_non_output_levels / 1048576.0,
+ stats.bytes_read_output_level / 1048576.0,
+ stats.bytes_written / 1048576.0, read_write_amp, write_amp,
+ status.ToString().c_str(), stats.num_input_records,
+ stats.num_dropped_records,
+ CompressionTypeToString(compact_->compaction->output_compression())
+ .c_str());
+
+ UpdateCompactionJobStats(stats);
+
+ auto stream = event_logger_->LogToBuffer(log_buffer_);
+ stream << "job" << job_id_ << "event"
+ << "compaction_finished"
+ << "compaction_time_micros" << stats.micros
+ << "compaction_time_cpu_micros" << stats.cpu_micros << "output_level"
+ << compact_->compaction->output_level() << "num_output_files"
+ << compact_->NumOutputFiles() << "total_output_size"
+ << compact_->total_bytes << "num_input_records"
+ << stats.num_input_records << "num_output_records"
+ << compact_->num_output_records << "num_subcompactions"
+ << compact_->sub_compact_states.size() << "output_compression"
+ << CompressionTypeToString(compact_->compaction->output_compression());
+
+ if (compaction_job_stats_ != nullptr) {
+ stream << "num_single_delete_mismatches"
+ << compaction_job_stats_->num_single_del_mismatch;
+ stream << "num_single_delete_fallthrough"
+ << compaction_job_stats_->num_single_del_fallthru;
+ }
+
+ if (measure_io_stats_ && compaction_job_stats_ != nullptr) {
+ stream << "file_write_nanos" << compaction_job_stats_->file_write_nanos;
+ stream << "file_range_sync_nanos"
+ << compaction_job_stats_->file_range_sync_nanos;
+ stream << "file_fsync_nanos" << compaction_job_stats_->file_fsync_nanos;
+ stream << "file_prepare_write_nanos"
+ << compaction_job_stats_->file_prepare_write_nanos;
+ }
+
+ stream << "lsm_state";
+ stream.StartArray();
+ for (int level = 0; level < vstorage->num_levels(); ++level) {
+ stream << vstorage->NumLevelFiles(level);
+ }
+ stream.EndArray();
+
+ CleanupCompaction();
+ return status;
+}
+
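+// Processes one subcompaction: builds a merging iterator over the compaction
+// inputs, drives a CompactionIterator across the subcompaction's key range,
+// and writes the surviving entries and range tombstones into one or more
+// output tables.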
+void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
+ assert(sub_compact != nullptr);
+
+ uint64_t prev_cpu_micros = env_->NowCPUNanos() / 1000;
+
+ ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();
+
+ // Create compaction filter and fail the compaction if
+ // IgnoreSnapshots() = false because it is not supported anymore
+ const CompactionFilter* compaction_filter =
+ cfd->ioptions()->compaction_filter;
+ std::unique_ptr<CompactionFilter> compaction_filter_from_factory = nullptr;
+ if (compaction_filter == nullptr) {
+ compaction_filter_from_factory =
+ sub_compact->compaction->CreateCompactionFilter();
+ compaction_filter = compaction_filter_from_factory.get();
+ }
+ if (compaction_filter != nullptr && !compaction_filter->IgnoreSnapshots()) {
+ sub_compact->status = Status::NotSupported(
+ "CompactionFilter::IgnoreSnapshots() = false is not supported "
+ "anymore.");
+ return;
+ }
+
+ CompactionRangeDelAggregator range_del_agg(&cfd->internal_comparator(),
+ existing_snapshots_);
+
+ // Although the v2 aggregator is what the level iterator(s) know about,
+ // the AddTombstones calls will be propagated down to the v1 aggregator.
+ std::unique_ptr<InternalIterator> input(versions_->MakeInputIterator(
+ sub_compact->compaction, &range_del_agg, file_options_for_read_));
+
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_COMPACTION_PROCESS_KV);
+
+ // I/O measurement variables
+ PerfLevel prev_perf_level = PerfLevel::kEnableTime;
+ const uint64_t kRecordStatsEvery = 1000;
+ uint64_t prev_write_nanos = 0;
+ uint64_t prev_fsync_nanos = 0;
+ uint64_t prev_range_sync_nanos = 0;
+ uint64_t prev_prepare_write_nanos = 0;
+ uint64_t prev_cpu_write_nanos = 0;
+ uint64_t prev_cpu_read_nanos = 0;
+ if (measure_io_stats_) {
+ prev_perf_level = GetPerfLevel();
+ SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex);
+ prev_write_nanos = IOSTATS(write_nanos);
+ prev_fsync_nanos = IOSTATS(fsync_nanos);
+ prev_range_sync_nanos = IOSTATS(range_sync_nanos);
+ prev_prepare_write_nanos = IOSTATS(prepare_write_nanos);
+ prev_cpu_write_nanos = IOSTATS(cpu_write_nanos);
+ prev_cpu_read_nanos = IOSTATS(cpu_read_nanos);
+ }
+
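+  // MergeHelper combines consecutive merge operands for the same user key
+  // using the column family's merge operator, taking the existing snapshots
+  // into account so that operands still visible to a snapshot are preserved.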
+ MergeHelper merge(
+ env_, cfd->user_comparator(), cfd->ioptions()->merge_operator,
+ compaction_filter, db_options_.info_log.get(),
+ false /* internal key corruption is expected */,
+ existing_snapshots_.empty() ? 0 : existing_snapshots_.back(),
+ snapshot_checker_, compact_->compaction->level(),
+ db_options_.statistics.get());
+
+ TEST_SYNC_POINT("CompactionJob::Run():Inprogress");
+ TEST_SYNC_POINT_CALLBACK(
+ "CompactionJob::Run():PausingManualCompaction:1",
+ reinterpret_cast<void*>(
+ const_cast<std::atomic<bool>*>(manual_compaction_paused_)));
+
+ Slice* start = sub_compact->start;
+ Slice* end = sub_compact->end;
+ if (start != nullptr) {
+ IterKey start_iter;
+ start_iter.SetInternalKey(*start, kMaxSequenceNumber, kValueTypeForSeek);
+ input->Seek(start_iter.GetInternalKey());
+ } else {
+ input->SeekToFirst();
+ }
+
+ Status status;
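+  // The CompactionIterator wraps the raw input iterator and yields only the
+  // entries that must be kept, applying snapshot visibility, the compaction
+  // filter, merge handling, and range tombstones.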
+ sub_compact->c_iter.reset(new CompactionIterator(
+ input.get(), cfd->user_comparator(), &merge, versions_->LastSequence(),
+ &existing_snapshots_, earliest_write_conflict_snapshot_,
+ snapshot_checker_, env_, ShouldReportDetailedTime(env_, stats_), false,
+ &range_del_agg, sub_compact->compaction, compaction_filter,
+ shutting_down_, preserve_deletes_seqnum_, manual_compaction_paused_,
+ db_options_.info_log));
+ auto c_iter = sub_compact->c_iter.get();
+ c_iter->SeekToFirst();
+ if (c_iter->Valid() && sub_compact->compaction->output_level() != 0) {
+    // ShouldStopBefore() maintains state based on keys processed so far. The
+    // compaction loop always calls it on the "next" key and thus would never
+    // pass it the first key, so we do that here.
+ sub_compact->ShouldStopBefore(c_iter->key(),
+ sub_compact->current_output_file_size);
+ }
+ const auto& c_iter_stats = c_iter->iter_stats();
+
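+  // Main loop: for each entry the compaction iterator keeps, append it to the
+  // current output table; cut the output file when it reaches
+  // max_output_file_size or when ShouldStopBefore() reports too much overlap
+  // with the grandparent level.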
+ while (status.ok() && !cfd->IsDropped() && c_iter->Valid()) {
+ // Invariant: c_iter.status() is guaranteed to be OK if c_iter->Valid()
+ // returns true.
+ const Slice& key = c_iter->key();
+ const Slice& value = c_iter->value();
+
+    // If an end key (exclusive) is specified, check whether the current key
+    // is >= it, and exit if so, because the iterator is then out of its range
+ if (end != nullptr &&
+ cfd->user_comparator()->Compare(c_iter->user_key(), *end) >= 0) {
+ break;
+ }
+ if (c_iter_stats.num_input_records % kRecordStatsEvery ==
+ kRecordStatsEvery - 1) {
+ RecordDroppedKeys(c_iter_stats, &sub_compact->compaction_job_stats);
+ c_iter->ResetRecordCounts();
+ RecordCompactionIOStats();
+ }
+
+ // Open output file if necessary
+ if (sub_compact->builder == nullptr) {
+ status = OpenCompactionOutputFile(sub_compact);
+ if (!status.ok()) {
+ break;
+ }
+ }
+ assert(sub_compact->builder != nullptr);
+ assert(sub_compact->current_output() != nullptr);
+ sub_compact->builder->Add(key, value);
+ sub_compact->current_output_file_size = sub_compact->builder->FileSize();
+ const ParsedInternalKey& ikey = c_iter->ikey();
+ sub_compact->current_output()->meta.UpdateBoundaries(
+ key, value, ikey.sequence, ikey.type);
+ sub_compact->num_output_records++;
+
+    // Close the output file if it is big enough. Two conditions determine
+    // that it is time to close it: (1) the current key should be this file's
+    // last key, or (2) the next key should not be in this file.
+ //
+ // TODO(aekmekji): determine if file should be closed earlier than this
+ // during subcompactions (i.e. if output size, estimated by input size, is
+ // going to be 1.2MB and max_output_file_size = 1MB, prefer to have 0.6MB
+ // and 0.6MB instead of 1MB and 0.2MB)
+ bool output_file_ended = false;
+ Status input_status;
+ if (sub_compact->compaction->output_level() != 0 &&
+ sub_compact->current_output_file_size >=
+ sub_compact->compaction->max_output_file_size()) {
+ // (1) this key terminates the file. For historical reasons, the iterator
+ // status before advancing will be given to FinishCompactionOutputFile().
+ input_status = input->status();
+ output_file_ended = true;
+ }
+ TEST_SYNC_POINT_CALLBACK(
+ "CompactionJob::Run():PausingManualCompaction:2",
+ reinterpret_cast<void*>(
+ const_cast<std::atomic<bool>*>(manual_compaction_paused_)));
+ c_iter->Next();
+ if (c_iter->status().IsManualCompactionPaused()) {
+ break;
+ }
+ if (!output_file_ended && c_iter->Valid() &&
+ sub_compact->compaction->output_level() != 0 &&
+ sub_compact->ShouldStopBefore(c_iter->key(),
+ sub_compact->current_output_file_size) &&
+ sub_compact->builder != nullptr) {
+ // (2) this key belongs to the next file. For historical reasons, the
+ // iterator status after advancing will be given to
+ // FinishCompactionOutputFile().
+ input_status = input->status();
+ output_file_ended = true;
+ }
+ if (output_file_ended) {
+ const Slice* next_key = nullptr;
+ if (c_iter->Valid()) {
+ next_key = &c_iter->key();
+ }
+ CompactionIterationStats range_del_out_stats;
+ status =
+ FinishCompactionOutputFile(input_status, sub_compact, &range_del_agg,
+ &range_del_out_stats, next_key);
+ RecordDroppedKeys(range_del_out_stats,
+ &sub_compact->compaction_job_stats);
+ }
+ }
+
+ sub_compact->compaction_job_stats.num_input_deletion_records =
+ c_iter_stats.num_input_deletion_records;
+ sub_compact->compaction_job_stats.num_corrupt_keys =
+ c_iter_stats.num_input_corrupt_records;
+ sub_compact->compaction_job_stats.num_single_del_fallthru =
+ c_iter_stats.num_single_del_fallthru;
+ sub_compact->compaction_job_stats.num_single_del_mismatch =
+ c_iter_stats.num_single_del_mismatch;
+ sub_compact->compaction_job_stats.total_input_raw_key_bytes +=
+ c_iter_stats.total_input_raw_key_bytes;
+ sub_compact->compaction_job_stats.total_input_raw_value_bytes +=
+ c_iter_stats.total_input_raw_value_bytes;
+
+ RecordTick(stats_, FILTER_OPERATION_TOTAL_TIME,
+ c_iter_stats.total_filter_time);
+ RecordDroppedKeys(c_iter_stats, &sub_compact->compaction_job_stats);
+ RecordCompactionIOStats();
+
+ if (status.ok() && cfd->IsDropped()) {
+ status =
+ Status::ColumnFamilyDropped("Column family dropped during compaction");
+ }
+ if ((status.ok() || status.IsColumnFamilyDropped()) &&
+ shutting_down_->load(std::memory_order_relaxed)) {
+ status = Status::ShutdownInProgress("Database shutdown");
+ }
+ if ((status.ok() || status.IsColumnFamilyDropped()) &&
+ (manual_compaction_paused_ &&
+ manual_compaction_paused_->load(std::memory_order_relaxed))) {
+ status = Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+ }
+ if (status.ok()) {
+ status = input->status();
+ }
+ if (status.ok()) {
+ status = c_iter->status();
+ }
+
+ if (status.ok() && sub_compact->builder == nullptr &&
+ sub_compact->outputs.size() == 0 && !range_del_agg.IsEmpty()) {
+ // handle subcompaction containing only range deletions
+ status = OpenCompactionOutputFile(sub_compact);
+ }
+
+ // Call FinishCompactionOutputFile() even if status is not ok: it needs to
+ // close the output file.
+ if (sub_compact->builder != nullptr) {
+ CompactionIterationStats range_del_out_stats;
+ Status s = FinishCompactionOutputFile(status, sub_compact, &range_del_agg,
+ &range_del_out_stats);
+ if (status.ok()) {
+ status = s;
+ }
+ RecordDroppedKeys(range_del_out_stats, &sub_compact->compaction_job_stats);
+ }
+
+ sub_compact->compaction_job_stats.cpu_micros =
+ env_->NowCPUNanos() / 1000 - prev_cpu_micros;
+
+ if (measure_io_stats_) {
+ sub_compact->compaction_job_stats.file_write_nanos +=
+ IOSTATS(write_nanos) - prev_write_nanos;
+ sub_compact->compaction_job_stats.file_fsync_nanos +=
+ IOSTATS(fsync_nanos) - prev_fsync_nanos;
+ sub_compact->compaction_job_stats.file_range_sync_nanos +=
+ IOSTATS(range_sync_nanos) - prev_range_sync_nanos;
+ sub_compact->compaction_job_stats.file_prepare_write_nanos +=
+ IOSTATS(prepare_write_nanos) - prev_prepare_write_nanos;
+ sub_compact->compaction_job_stats.cpu_micros -=
+ (IOSTATS(cpu_write_nanos) - prev_cpu_write_nanos +
+ IOSTATS(cpu_read_nanos) - prev_cpu_read_nanos) /
+ 1000;
+ if (prev_perf_level != PerfLevel::kEnableTimeAndCPUTimeExceptForMutex) {
+ SetPerfLevel(prev_perf_level);
+ }
+ }
+
+ sub_compact->c_iter.reset();
+ input.reset();
+ sub_compact->status = status;
+}
+
+void CompactionJob::RecordDroppedKeys(
+ const CompactionIterationStats& c_iter_stats,
+ CompactionJobStats* compaction_job_stats) {
+ if (c_iter_stats.num_record_drop_user > 0) {
+ RecordTick(stats_, COMPACTION_KEY_DROP_USER,
+ c_iter_stats.num_record_drop_user);
+ }
+ if (c_iter_stats.num_record_drop_hidden > 0) {
+ RecordTick(stats_, COMPACTION_KEY_DROP_NEWER_ENTRY,
+ c_iter_stats.num_record_drop_hidden);
+ if (compaction_job_stats) {
+ compaction_job_stats->num_records_replaced +=
+ c_iter_stats.num_record_drop_hidden;
+ }
+ }
+ if (c_iter_stats.num_record_drop_obsolete > 0) {
+ RecordTick(stats_, COMPACTION_KEY_DROP_OBSOLETE,
+ c_iter_stats.num_record_drop_obsolete);
+ if (compaction_job_stats) {
+ compaction_job_stats->num_expired_deletion_records +=
+ c_iter_stats.num_record_drop_obsolete;
+ }
+ }
+ if (c_iter_stats.num_record_drop_range_del > 0) {
+ RecordTick(stats_, COMPACTION_KEY_DROP_RANGE_DEL,
+ c_iter_stats.num_record_drop_range_del);
+ }
+ if (c_iter_stats.num_range_del_drop_obsolete > 0) {
+ RecordTick(stats_, COMPACTION_RANGE_DEL_DROP_OBSOLETE,
+ c_iter_stats.num_range_del_drop_obsolete);
+ }
+ if (c_iter_stats.num_optimized_del_drop_obsolete > 0) {
+ RecordTick(stats_, COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE,
+ c_iter_stats.num_optimized_del_drop_obsolete);
+ }
+}
+
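+// Finalizes the current output table of a subcompaction: adds the range
+// tombstones that fall into the file's key range (extending its boundaries
+// accordingly), finishes the table builder, syncs and closes the file, and
+// reports the new file to listeners and the SstFileManager.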
+Status CompactionJob::FinishCompactionOutputFile(
+ const Status& input_status, SubcompactionState* sub_compact,
+ CompactionRangeDelAggregator* range_del_agg,
+ CompactionIterationStats* range_del_out_stats,
+ const Slice* next_table_min_key /* = nullptr */) {
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_COMPACTION_SYNC_FILE);
+ assert(sub_compact != nullptr);
+ assert(sub_compact->outfile);
+ assert(sub_compact->builder != nullptr);
+ assert(sub_compact->current_output() != nullptr);
+
+ uint64_t output_number = sub_compact->current_output()->meta.fd.GetNumber();
+ assert(output_number != 0);
+
+ ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();
+ const Comparator* ucmp = cfd->user_comparator();
+
+ // Check for iterator errors
+ Status s = input_status;
+ auto meta = &sub_compact->current_output()->meta;
+ assert(meta != nullptr);
+ if (s.ok()) {
+ Slice lower_bound_guard, upper_bound_guard;
+ std::string smallest_user_key;
+ const Slice *lower_bound, *upper_bound;
+ bool lower_bound_from_sub_compact = false;
+ if (sub_compact->outputs.size() == 1) {
+ // For the first output table, include range tombstones before the min key
+ // but after the subcompaction boundary.
+ lower_bound = sub_compact->start;
+ lower_bound_from_sub_compact = true;
+ } else if (meta->smallest.size() > 0) {
+ // For subsequent output tables, only include range tombstones from min
+ // key onwards since the previous file was extended to contain range
+ // tombstones falling before min key.
+ smallest_user_key = meta->smallest.user_key().ToString(false /*hex*/);
+ lower_bound_guard = Slice(smallest_user_key);
+ lower_bound = &lower_bound_guard;
+ } else {
+ lower_bound = nullptr;
+ }
+ if (next_table_min_key != nullptr) {
+ // This may be the last file in the subcompaction in some cases, so we
+ // need to compare the end key of subcompaction with the next file start
+ // key. When the end key is chosen by the subcompaction, we know that
+ // it must be the biggest key in output file. Therefore, it is safe to
+ // use the smaller key as the upper bound of the output file, to ensure
+ // that there is no overlapping between different output files.
+ upper_bound_guard = ExtractUserKey(*next_table_min_key);
+ if (sub_compact->end != nullptr &&
+ ucmp->Compare(upper_bound_guard, *sub_compact->end) >= 0) {
+ upper_bound = sub_compact->end;
+ } else {
+ upper_bound = &upper_bound_guard;
+ }
+ } else {
+ // This is the last file in the subcompaction, so extend until the
+ // subcompaction ends.
+ upper_bound = sub_compact->end;
+ }
+ auto earliest_snapshot = kMaxSequenceNumber;
+ if (existing_snapshots_.size() > 0) {
+ earliest_snapshot = existing_snapshots_[0];
+ }
+ bool has_overlapping_endpoints;
+ if (upper_bound != nullptr && meta->largest.size() > 0) {
+ has_overlapping_endpoints =
+ ucmp->Compare(meta->largest.user_key(), *upper_bound) == 0;
+ } else {
+ has_overlapping_endpoints = false;
+ }
+
+    // The end key of the subcompaction must be greater than or equal to the
+    // upper bound. If the subcompaction's end or the upper bound is null, this
+    // file is the last file in the compaction, so there will be no overlap
+    // between this file and others.
+ assert(sub_compact->end == nullptr ||
+ upper_bound == nullptr ||
+           ucmp->Compare(*upper_bound, *sub_compact->end) <= 0);
+ auto it = range_del_agg->NewIterator(lower_bound, upper_bound,
+ has_overlapping_endpoints);
+ // Position the range tombstone output iterator. There may be tombstone
+ // fragments that are entirely out of range, so make sure that we do not
+ // include those.
+ if (lower_bound != nullptr) {
+ it->Seek(*lower_bound);
+ } else {
+ it->SeekToFirst();
+ }
+ for (; it->Valid(); it->Next()) {
+ auto tombstone = it->Tombstone();
+ if (upper_bound != nullptr) {
+ int cmp = ucmp->Compare(*upper_bound, tombstone.start_key_);
+ if ((has_overlapping_endpoints && cmp < 0) ||
+ (!has_overlapping_endpoints && cmp <= 0)) {
+ // Tombstones starting after upper_bound only need to be included in
+ // the next table. If the current SST ends before upper_bound, i.e.,
+ // `has_overlapping_endpoints == false`, we can also skip over range
+ // tombstones that start exactly at upper_bound. Such range tombstones
+ // will be included in the next file and are not relevant to the point
+ // keys or endpoints of the current file.
+ break;
+ }
+ }
+
+ if (bottommost_level_ && tombstone.seq_ <= earliest_snapshot) {
+ // TODO(andrewkr): tombstones that span multiple output files are
+ // counted for each compaction output file, so lots of double counting.
+ range_del_out_stats->num_range_del_drop_obsolete++;
+ range_del_out_stats->num_record_drop_obsolete++;
+ continue;
+ }
+
+ auto kv = tombstone.Serialize();
+ assert(lower_bound == nullptr ||
+ ucmp->Compare(*lower_bound, kv.second) < 0);
+ sub_compact->builder->Add(kv.first.Encode(), kv.second);
+ InternalKey smallest_candidate = std::move(kv.first);
+ if (lower_bound != nullptr &&
+ ucmp->Compare(smallest_candidate.user_key(), *lower_bound) <= 0) {
+ // Pretend the smallest key has the same user key as lower_bound
+ // (the max key in the previous table or subcompaction) in order for
+ // files to appear key-space partitioned.
+ //
+ // When lower_bound is chosen by a subcompaction, we know that
+ // subcompactions over smaller keys cannot contain any keys at
+ // lower_bound. We also know that smaller subcompactions exist, because
+      // otherwise the subcompaction would be unbounded on the left. As a
+ // result, we know that no other files on the output level will contain
+ // actual keys at lower_bound (an output file may have a largest key of
+ // lower_bound@kMaxSequenceNumber, but this only indicates a large range
+ // tombstone was truncated). Therefore, it is safe to use the
+ // tombstone's sequence number, to ensure that keys at lower_bound at
+ // lower levels are covered by truncated tombstones.
+ //
+ // If lower_bound was chosen by the smallest data key in the file,
+ // choose lowest seqnum so this file's smallest internal key comes after
+ // the previous file's largest. The fake seqnum is OK because the read
+ // path's file-picking code only considers user key.
+ smallest_candidate = InternalKey(
+ *lower_bound, lower_bound_from_sub_compact ? tombstone.seq_ : 0,
+ kTypeRangeDeletion);
+ }
+ InternalKey largest_candidate = tombstone.SerializeEndKey();
+ if (upper_bound != nullptr &&
+ ucmp->Compare(*upper_bound, largest_candidate.user_key()) <= 0) {
+ // Pretend the largest key has the same user key as upper_bound (the
+ // min key in the following table or subcompaction) in order for files
+ // to appear key-space partitioned.
+ //
+ // Choose highest seqnum so this file's largest internal key comes
+ // before the next file's/subcompaction's smallest. The fake seqnum is
+ // OK because the read path's file-picking code only considers the user
+ // key portion.
+ //
+ // Note Seek() also creates InternalKey with (user_key,
+ // kMaxSequenceNumber), but with kTypeDeletion (0x7) instead of
+ // kTypeRangeDeletion (0xF), so the range tombstone comes before the
+ // Seek() key in InternalKey's ordering. So Seek() will look in the
+ // next file for the user key.
+ largest_candidate =
+ InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion);
+ }
+#ifndef NDEBUG
+ SequenceNumber smallest_ikey_seqnum = kMaxSequenceNumber;
+ if (meta->smallest.size() > 0) {
+ smallest_ikey_seqnum = GetInternalKeySeqno(meta->smallest.Encode());
+ }
+#endif
+ meta->UpdateBoundariesForRange(smallest_candidate, largest_candidate,
+ tombstone.seq_,
+ cfd->internal_comparator());
+
+ // The smallest key in a file is used for range tombstone truncation, so
+ // it cannot have a seqnum of 0 (unless the smallest data key in a file
+ // has a seqnum of 0). Otherwise, the truncated tombstone may expose
+ // deleted keys at lower levels.
+ assert(smallest_ikey_seqnum == 0 ||
+ ExtractInternalKeyFooter(meta->smallest.Encode()) !=
+ PackSequenceAndType(0, kTypeRangeDeletion));
+ }
+ meta->marked_for_compaction = sub_compact->builder->NeedCompact();
+ }
+ const uint64_t current_entries = sub_compact->builder->NumEntries();
+ if (s.ok()) {
+ s = sub_compact->builder->Finish();
+ } else {
+ sub_compact->builder->Abandon();
+ }
+ const uint64_t current_bytes = sub_compact->builder->FileSize();
+ if (s.ok()) {
+ // Add the checksum information to file metadata.
+ meta->file_checksum = sub_compact->builder->GetFileChecksum();
+ meta->file_checksum_func_name =
+ sub_compact->builder->GetFileChecksumFuncName();
+
+ meta->fd.file_size = current_bytes;
+ }
+ sub_compact->current_output()->finished = true;
+ sub_compact->total_bytes += current_bytes;
+
+ // Finish and check for file errors
+ if (s.ok()) {
+ StopWatch sw(env_, stats_, COMPACTION_OUTFILE_SYNC_MICROS);
+ s = sub_compact->outfile->Sync(db_options_.use_fsync);
+ }
+ if (s.ok()) {
+ s = sub_compact->outfile->Close();
+ }
+ sub_compact->outfile.reset();
+
+ TableProperties tp;
+ if (s.ok()) {
+ tp = sub_compact->builder->GetTableProperties();
+ }
+
+ if (s.ok() && current_entries == 0 && tp.num_range_deletions == 0) {
+    // If there is nothing to output, there is no need to generate an SST
+    // file. This happens when the output level is the bottommost level and,
+    // at the same time, the sub_compact produced no output.
+ std::string fname =
+ TableFileName(sub_compact->compaction->immutable_cf_options()->cf_paths,
+ meta->fd.GetNumber(), meta->fd.GetPathId());
+ env_->DeleteFile(fname);
+
+ // Also need to remove the file from outputs, or it will be added to the
+ // VersionEdit.
+ assert(!sub_compact->outputs.empty());
+ sub_compact->outputs.pop_back();
+ meta = nullptr;
+ }
+
+ if (s.ok() && (current_entries > 0 || tp.num_range_deletions > 0)) {
+ // Output to event logger and fire events.
+ sub_compact->current_output()->table_properties =
+ std::make_shared<TableProperties>(tp);
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "[%s] [JOB %d] Generated table #%" PRIu64 ": %" PRIu64
+ " keys, %" PRIu64 " bytes%s",
+ cfd->GetName().c_str(), job_id_, output_number,
+ current_entries, current_bytes,
+ meta->marked_for_compaction ? " (need compaction)" : "");
+ }
+ std::string fname;
+ FileDescriptor output_fd;
+ uint64_t oldest_blob_file_number = kInvalidBlobFileNumber;
+ if (meta != nullptr) {
+ fname =
+ TableFileName(sub_compact->compaction->immutable_cf_options()->cf_paths,
+ meta->fd.GetNumber(), meta->fd.GetPathId());
+ output_fd = meta->fd;
+ oldest_blob_file_number = meta->oldest_blob_file_number;
+ } else {
+ fname = "(nil)";
+ }
+ EventHelpers::LogAndNotifyTableFileCreationFinished(
+ event_logger_, cfd->ioptions()->listeners, dbname_, cfd->GetName(), fname,
+ job_id_, output_fd, oldest_blob_file_number, tp,
+ TableFileCreationReason::kCompaction, s);
+
+#ifndef ROCKSDB_LITE
+ // Report new file to SstFileManagerImpl
+ auto sfm =
+ static_cast<SstFileManagerImpl*>(db_options_.sst_file_manager.get());
+ if (sfm && meta != nullptr && meta->fd.GetPathId() == 0) {
+ sfm->OnAddFile(fname);
+ if (sfm->IsMaxAllowedSpaceReached()) {
+ // TODO(ajkr): should we return OK() if max space was reached by the final
+ // compaction output file (similarly to how flush works when full)?
+ s = Status::SpaceLimit("Max allowed space was reached");
+ TEST_SYNC_POINT(
+ "CompactionJob::FinishCompactionOutputFile:"
+ "MaxAllowedSpaceReached");
+ InstrumentedMutexLock l(db_mutex_);
+ db_error_handler_->SetBGError(s, BackgroundErrorReason::kCompaction);
+ }
+ }
+#endif
+
+ sub_compact->builder.reset();
+ sub_compact->current_output_file_size = 0;
+ return s;
+}
+
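+// Applies the compaction result to the current version: records the deletion
+// of the input files and the addition of the output files in a VersionEdit,
+// then logs and applies that edit while holding the DB mutex.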
+Status CompactionJob::InstallCompactionResults(
+ const MutableCFOptions& mutable_cf_options) {
+ db_mutex_->AssertHeld();
+
+ auto* compaction = compact_->compaction;
+ // paranoia: verify that the files that we started with
+ // still exist in the current version and in the same original level.
+ // This ensures that a concurrent compaction did not erroneously
+ // pick the same files to compact_.
+ if (!versions_->VerifyCompactionFileConsistency(compaction)) {
+ Compaction::InputLevelSummaryBuffer inputs_summary;
+
+ ROCKS_LOG_ERROR(db_options_.info_log, "[%s] [JOB %d] Compaction %s aborted",
+ compaction->column_family_data()->GetName().c_str(),
+ job_id_, compaction->InputLevelSummary(&inputs_summary));
+ return Status::Corruption("Compaction input files inconsistent");
+ }
+
+ {
+ Compaction::InputLevelSummaryBuffer inputs_summary;
+ ROCKS_LOG_INFO(
+ db_options_.info_log, "[%s] [JOB %d] Compacted %s => %" PRIu64 " bytes",
+ compaction->column_family_data()->GetName().c_str(), job_id_,
+ compaction->InputLevelSummary(&inputs_summary), compact_->total_bytes);
+ }
+
+ // Add compaction inputs
+ compaction->AddInputDeletions(compact_->compaction->edit());
+
+ for (const auto& sub_compact : compact_->sub_compact_states) {
+ for (const auto& out : sub_compact.outputs) {
+ compaction->edit()->AddFile(compaction->output_level(), out.meta);
+ }
+ }
+ return versions_->LogAndApply(compaction->column_family_data(),
+ mutable_cf_options, compaction->edit(),
+ db_mutex_, db_directory_);
+}
+
+void CompactionJob::RecordCompactionIOStats() {
+ RecordTick(stats_, COMPACT_READ_BYTES, IOSTATS(bytes_read));
+ ThreadStatusUtil::IncreaseThreadOperationProperty(
+ ThreadStatus::COMPACTION_BYTES_READ, IOSTATS(bytes_read));
+ IOSTATS_RESET(bytes_read);
+ RecordTick(stats_, COMPACT_WRITE_BYTES, IOSTATS(bytes_written));
+ ThreadStatusUtil::IncreaseThreadOperationProperty(
+ ThreadStatus::COMPACTION_BYTES_WRITTEN, IOSTATS(bytes_written));
+ IOSTATS_RESET(bytes_written);
+}
+
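+// Opens the next output table for a subcompaction: allocates a file number,
+// creates the writable file, registers a new SubcompactionState::Output, and
+// sets up a TableBuilder with the compaction's output compression and level.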
+Status CompactionJob::OpenCompactionOutputFile(
+ SubcompactionState* sub_compact) {
+ assert(sub_compact != nullptr);
+ assert(sub_compact->builder == nullptr);
+ // no need to lock because VersionSet::next_file_number_ is atomic
+ uint64_t file_number = versions_->NewFileNumber();
+ std::string fname =
+ TableFileName(sub_compact->compaction->immutable_cf_options()->cf_paths,
+ file_number, sub_compact->compaction->output_path_id());
+ // Fire events.
+ ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();
+#ifndef ROCKSDB_LITE
+ EventHelpers::NotifyTableFileCreationStarted(
+ cfd->ioptions()->listeners, dbname_, cfd->GetName(), fname, job_id_,
+ TableFileCreationReason::kCompaction);
+#endif // !ROCKSDB_LITE
+ // Make the output file
+ std::unique_ptr<FSWritableFile> writable_file;
+#ifndef NDEBUG
+ bool syncpoint_arg = file_options_.use_direct_writes;
+ TEST_SYNC_POINT_CALLBACK("CompactionJob::OpenCompactionOutputFile",
+ &syncpoint_arg);
+#endif
+ Status s = NewWritableFile(fs_, fname, &writable_file, file_options_);
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(
+ db_options_.info_log,
+ "[%s] [JOB %d] OpenCompactionOutputFiles for table #%" PRIu64
+ " fails at NewWritableFile with status %s",
+ sub_compact->compaction->column_family_data()->GetName().c_str(),
+ job_id_, file_number, s.ToString().c_str());
+ LogFlush(db_options_.info_log);
+ EventHelpers::LogAndNotifyTableFileCreationFinished(
+ event_logger_, cfd->ioptions()->listeners, dbname_, cfd->GetName(),
+ fname, job_id_, FileDescriptor(), kInvalidBlobFileNumber,
+ TableProperties(), TableFileCreationReason::kCompaction, s);
+ return s;
+ }
+
+ // Try to figure out the output file's oldest ancester time.
+ int64_t temp_current_time = 0;
+ auto get_time_status = env_->GetCurrentTime(&temp_current_time);
+ // Safe to proceed even if GetCurrentTime fails. So, log and proceed.
+ if (!get_time_status.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Failed to get current time. Status: %s",
+ get_time_status.ToString().c_str());
+ }
+ uint64_t current_time = static_cast<uint64_t>(temp_current_time);
+ uint64_t oldest_ancester_time =
+ sub_compact->compaction->MinInputFileOldestAncesterTime();
+ if (oldest_ancester_time == port::kMaxUint64) {
+ oldest_ancester_time = current_time;
+ }
+
+ // Initialize a SubcompactionState::Output and add it to sub_compact->outputs
+ {
+ SubcompactionState::Output out;
+ out.meta.fd = FileDescriptor(file_number,
+ sub_compact->compaction->output_path_id(), 0);
+ out.meta.oldest_ancester_time = oldest_ancester_time;
+ out.meta.file_creation_time = current_time;
+ out.finished = false;
+ sub_compact->outputs.push_back(out);
+ }
+
+ writable_file->SetIOPriority(Env::IOPriority::IO_LOW);
+ writable_file->SetWriteLifeTimeHint(write_hint_);
+ writable_file->SetPreallocationBlockSize(static_cast<size_t>(
+ sub_compact->compaction->OutputFilePreallocationSize()));
+ const auto& listeners =
+ sub_compact->compaction->immutable_cf_options()->listeners;
+ sub_compact->outfile.reset(
+ new WritableFileWriter(std::move(writable_file), fname, file_options_,
+ env_, db_options_.statistics.get(), listeners,
+ db_options_.sst_file_checksum_func.get()));
+
+  // If the column family is configured to optimize filters only for hits,
+  // we can skip creating filters when this is the bottommost level, where
+  // the data is going to be found.
+ bool skip_filters =
+ cfd->ioptions()->optimize_filters_for_hits && bottommost_level_;
+
+ sub_compact->builder.reset(NewTableBuilder(
+ *cfd->ioptions(), *(sub_compact->compaction->mutable_cf_options()),
+ cfd->internal_comparator(), cfd->int_tbl_prop_collector_factories(),
+ cfd->GetID(), cfd->GetName(), sub_compact->outfile.get(),
+ sub_compact->compaction->output_compression(),
+ 0 /*sample_for_compression */,
+ sub_compact->compaction->output_compression_opts(),
+ sub_compact->compaction->output_level(), skip_filters,
+ oldest_ancester_time, 0 /* oldest_key_time */,
+ sub_compact->compaction->max_output_file_size(), current_time));
+ LogFlush(db_options_.info_log);
+ return s;
+}
+
+void CompactionJob::CleanupCompaction() {
+ for (SubcompactionState& sub_compact : compact_->sub_compact_states) {
+ const auto& sub_status = sub_compact.status;
+
+ if (sub_compact.builder != nullptr) {
+ // May happen if we get a shutdown call in the middle of compaction
+ sub_compact.builder->Abandon();
+ sub_compact.builder.reset();
+ } else {
+ assert(!sub_status.ok() || sub_compact.outfile == nullptr);
+ }
+ for (const auto& out : sub_compact.outputs) {
+ // If this file was inserted into the table cache then remove
+      // it here because this compaction was not committed.
+ if (!sub_status.ok()) {
+ TableCache::Evict(table_cache_.get(), out.meta.fd.GetNumber());
+ }
+ }
+ }
+ delete compact_;
+ compact_ = nullptr;
+}
+
+#ifndef ROCKSDB_LITE
+namespace {
+void CopyPrefix(const Slice& src, size_t prefix_length, std::string* dst) {
+ assert(prefix_length > 0);
+ size_t length = src.size() > prefix_length ? prefix_length : src.size();
+ dst->assign(src.data(), length);
+}
+} // namespace
+
+#endif // !ROCKSDB_LITE
+
+void CompactionJob::UpdateCompactionStats() {
+ Compaction* compaction = compact_->compaction;
+ compaction_stats_.num_input_files_in_non_output_levels = 0;
+ compaction_stats_.num_input_files_in_output_level = 0;
+ for (int input_level = 0;
+ input_level < static_cast<int>(compaction->num_input_levels());
+ ++input_level) {
+ if (compaction->level(input_level) != compaction->output_level()) {
+ UpdateCompactionInputStatsHelper(
+ &compaction_stats_.num_input_files_in_non_output_levels,
+ &compaction_stats_.bytes_read_non_output_levels, input_level);
+ } else {
+ UpdateCompactionInputStatsHelper(
+ &compaction_stats_.num_input_files_in_output_level,
+ &compaction_stats_.bytes_read_output_level, input_level);
+ }
+ }
+
+ uint64_t num_output_records = 0;
+
+ for (const auto& sub_compact : compact_->sub_compact_states) {
+ size_t num_output_files = sub_compact.outputs.size();
+ if (sub_compact.builder != nullptr) {
+ // An error occurred so ignore the last output.
+ assert(num_output_files > 0);
+ --num_output_files;
+ }
+ compaction_stats_.num_output_files += static_cast<int>(num_output_files);
+
+ num_output_records += sub_compact.num_output_records;
+
+ for (const auto& out : sub_compact.outputs) {
+ compaction_stats_.bytes_written += out.meta.fd.file_size;
+ }
+ }
+
+ if (compaction_stats_.num_input_records > num_output_records) {
+ compaction_stats_.num_dropped_records =
+ compaction_stats_.num_input_records - num_output_records;
+ }
+}
+
+void CompactionJob::UpdateCompactionInputStatsHelper(int* num_files,
+ uint64_t* bytes_read,
+ int input_level) {
+ const Compaction* compaction = compact_->compaction;
+ auto num_input_files = compaction->num_input_files(input_level);
+ *num_files += static_cast<int>(num_input_files);
+
+ for (size_t i = 0; i < num_input_files; ++i) {
+ const auto* file_meta = compaction->input(input_level, i);
+ *bytes_read += file_meta->fd.GetFileSize();
+ compaction_stats_.num_input_records +=
+ static_cast<uint64_t>(file_meta->num_entries);
+ }
+}
+
+void CompactionJob::UpdateCompactionJobStats(
+ const InternalStats::CompactionStats& stats) const {
+#ifndef ROCKSDB_LITE
+ if (compaction_job_stats_) {
+ compaction_job_stats_->elapsed_micros = stats.micros;
+
+ // input information
+ compaction_job_stats_->total_input_bytes =
+ stats.bytes_read_non_output_levels + stats.bytes_read_output_level;
+ compaction_job_stats_->num_input_records = stats.num_input_records;
+ compaction_job_stats_->num_input_files =
+ stats.num_input_files_in_non_output_levels +
+ stats.num_input_files_in_output_level;
+ compaction_job_stats_->num_input_files_at_output_level =
+ stats.num_input_files_in_output_level;
+
+ // output information
+ compaction_job_stats_->total_output_bytes = stats.bytes_written;
+ compaction_job_stats_->num_output_records = compact_->num_output_records;
+ compaction_job_stats_->num_output_files = stats.num_output_files;
+
+ if (compact_->NumOutputFiles() > 0U) {
+ CopyPrefix(compact_->SmallestUserKey(),
+ CompactionJobStats::kMaxPrefixLength,
+ &compaction_job_stats_->smallest_output_key_prefix);
+ CopyPrefix(compact_->LargestUserKey(),
+ CompactionJobStats::kMaxPrefixLength,
+ &compaction_job_stats_->largest_output_key_prefix);
+ }
+ }
+#else
+ (void)stats;
+#endif // !ROCKSDB_LITE
+}
+
+void CompactionJob::LogCompaction() {
+ Compaction* compaction = compact_->compaction;
+ ColumnFamilyData* cfd = compaction->column_family_data();
+
+ // Let's check if anything will get logged. Don't prepare all the info if
+ // we're not logging
+ if (db_options_.info_log_level <= InfoLogLevel::INFO_LEVEL) {
+ Compaction::InputLevelSummaryBuffer inputs_summary;
+ ROCKS_LOG_INFO(
+ db_options_.info_log, "[%s] [JOB %d] Compacting %s, score %.2f",
+ cfd->GetName().c_str(), job_id_,
+ compaction->InputLevelSummary(&inputs_summary), compaction->score());
+ char scratch[2345];
+ compaction->Summary(scratch, sizeof(scratch));
+ ROCKS_LOG_INFO(db_options_.info_log, "[%s] Compaction start summary: %s\n",
+ cfd->GetName().c_str(), scratch);
+ // build event logger report
+ auto stream = event_logger_->Log();
+ stream << "job" << job_id_ << "event"
+ << "compaction_started"
+ << "compaction_reason"
+ << GetCompactionReasonString(compaction->compaction_reason());
+ for (size_t i = 0; i < compaction->num_input_levels(); ++i) {
+ stream << ("files_L" + ToString(compaction->level(i)));
+ stream.StartArray();
+ for (auto f : *compaction->inputs(i)) {
+ stream << f->fd.GetNumber();
+ }
+ stream.EndArray();
+ }
+ stream << "score" << compaction->score() << "input_data_size"
+ << compaction->CalculateTotalInputSize();
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_job.h b/src/rocksdb/db/compaction/compaction_job.h
new file mode 100644
index 000000000..c15f502a1
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_job.h
@@ -0,0 +1,198 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <atomic>
+#include <deque>
+#include <functional>
+#include <limits>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/compaction/compaction_iterator.h"
+#include "db/dbformat.h"
+#include "db/flush_scheduler.h"
+#include "db/internal_stats.h"
+#include "db/job_context.h"
+#include "db/log_writer.h"
+#include "db/memtable_list.h"
+#include "db/range_del_aggregator.h"
+#include "db/version_edit.h"
+#include "db/write_controller.h"
+#include "db/write_thread.h"
+#include "logging/event_logger.h"
+#include "options/cf_options.h"
+#include "options/db_options.h"
+#include "port/port.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/compaction_job_stats.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/transaction_log.h"
+#include "table/scoped_arena_iterator.h"
+#include "util/autovector.h"
+#include "util/stop_watch.h"
+#include "util/thread_local.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Arena;
+class ErrorHandler;
+class MemTable;
+class SnapshotChecker;
+class TableCache;
+class Version;
+class VersionEdit;
+class VersionSet;
+
+// CompactionJob is responsible for executing the compaction. Each (manual or
+// automated) compaction corresponds to a CompactionJob object, and usually
+// goes through the stages of `Prepare()`->`Run()`->`Install()`. CompactionJob
+// will divide the compaction into subcompactions and execute them in parallel
+// if needed.
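+//
+// A minimal sketch of the expected call sequence (illustrative only; argument
+// lists are elided and locking follows the per-method comments below):
+//
+//   CompactionJob job(/* ... */);
+//   db_mutex->Lock();
+//   job.Prepare();                        // requires the DB mutex
+//   db_mutex->Unlock();
+//   Status s = job.Run();                 // runs without the DB mutex
+//   db_mutex->Lock();
+//   s = job.Install(mutable_cf_options);  // requires the DB mutex
+//   db_mutex->Unlock();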
+class CompactionJob {
+ public:
+ CompactionJob(int job_id, Compaction* compaction,
+ const ImmutableDBOptions& db_options,
+ const FileOptions& file_options, VersionSet* versions,
+ const std::atomic<bool>* shutting_down,
+ const SequenceNumber preserve_deletes_seqnum,
+ LogBuffer* log_buffer, Directory* db_directory,
+ Directory* output_directory, Statistics* stats,
+ InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler,
+ std::vector<SequenceNumber> existing_snapshots,
+ SequenceNumber earliest_write_conflict_snapshot,
+ const SnapshotChecker* snapshot_checker,
+ std::shared_ptr<Cache> table_cache, EventLogger* event_logger,
+ bool paranoid_file_checks, bool measure_io_stats,
+ const std::string& dbname,
+ CompactionJobStats* compaction_job_stats,
+ Env::Priority thread_pri,
+ const std::atomic<bool>* manual_compaction_paused = nullptr);
+
+ ~CompactionJob();
+
+ // no copy/move
+ CompactionJob(CompactionJob&& job) = delete;
+ CompactionJob(const CompactionJob& job) = delete;
+ CompactionJob& operator=(const CompactionJob& job) = delete;
+
+ // REQUIRED: mutex held
+ // Prepare for the compaction by setting up boundaries for each subcompaction
+ void Prepare();
+  // REQUIRED: mutex not held
+  // Launch threads for each subcompaction and wait for them to finish. After
+  // that, verify that the output tables are usable, and finally do the
+  // bookkeeping to unify the subcompaction results.
+ Status Run();
+
+ // REQUIRED: mutex held
+ // Add compaction input/output to the current version
+ Status Install(const MutableCFOptions& mutable_cf_options);
+
+ private:
+ struct SubcompactionState;
+
+ void AggregateStatistics();
+
+ // Generates a histogram representing potential divisions of key ranges from
+ // the input. It adds the starting and/or ending keys of certain input files
+ // to the working set and then finds the approximate size of data in between
+ // each consecutive pair of slices. Then it divides these ranges into
+ // consecutive groups such that each group has a similar size.
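+  //
+  // For example (illustrative numbers only): with boundary keys
+  // {a, c, f, k, n, z} and approximate sizes of 10, 10, 20, 15 and 5 MB
+  // between consecutive keys, aiming for three subcompactions of roughly
+  // 20MB each could yield the key ranges [a, f), [f, k) and [k, z).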
+ void GenSubcompactionBoundaries();
+
+  // Update the thread status for starting a compaction.
+ void ReportStartedCompaction(Compaction* compaction);
+ void AllocateCompactionOutputFileNumbers();
+ // Call compaction filter. Then iterate through input and compact the
+ // kv-pairs
+ void ProcessKeyValueCompaction(SubcompactionState* sub_compact);
+
+ Status FinishCompactionOutputFile(
+ const Status& input_status, SubcompactionState* sub_compact,
+ CompactionRangeDelAggregator* range_del_agg,
+ CompactionIterationStats* range_del_out_stats,
+ const Slice* next_table_min_key = nullptr);
+ Status InstallCompactionResults(const MutableCFOptions& mutable_cf_options);
+ void RecordCompactionIOStats();
+ Status OpenCompactionOutputFile(SubcompactionState* sub_compact);
+ void CleanupCompaction();
+ void UpdateCompactionJobStats(
+ const InternalStats::CompactionStats& stats) const;
+ void RecordDroppedKeys(const CompactionIterationStats& c_iter_stats,
+ CompactionJobStats* compaction_job_stats = nullptr);
+
+ void UpdateCompactionStats();
+ void UpdateCompactionInputStatsHelper(
+ int* num_files, uint64_t* bytes_read, int input_level);
+
+ void LogCompaction();
+
+ int job_id_;
+
+ // CompactionJob state
+ struct CompactionState;
+ CompactionState* compact_;
+ CompactionJobStats* compaction_job_stats_;
+ InternalStats::CompactionStats compaction_stats_;
+
+ // DBImpl state
+ const std::string& dbname_;
+ const ImmutableDBOptions& db_options_;
+ const FileOptions file_options_;
+
+ Env* env_;
+ FileSystem* fs_;
+  // FileOptions optimized for compaction table reads
+ FileOptions file_options_for_read_;
+ VersionSet* versions_;
+ const std::atomic<bool>* shutting_down_;
+ const std::atomic<bool>* manual_compaction_paused_;
+ const SequenceNumber preserve_deletes_seqnum_;
+ LogBuffer* log_buffer_;
+ Directory* db_directory_;
+ Directory* output_directory_;
+ Statistics* stats_;
+ InstrumentedMutex* db_mutex_;
+ ErrorHandler* db_error_handler_;
+  // If there are two snapshots with sequence numbers s1 and s2, s1 < s2,
+  // and we find two versions of a key k1 that both lie entirely between
+  // s1 and s2, then the earlier version of k1 can be safely deleted,
+  // because that version is not visible in any snapshot.
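+  // For example (illustrative sequence numbers): with snapshots at 10 and 20
+  // and versions of a key at sequence numbers 12 and 15, the version at 12
+  // is not visible to either snapshot (snapshot 10 sees neither version and
+  // snapshot 20 sees the newer one at 15), so it can be dropped.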
+ std::vector<SequenceNumber> existing_snapshots_;
+
+ // This is the earliest snapshot that could be used for write-conflict
+ // checking by a transaction. For any user-key newer than this snapshot, we
+ // should make sure not to remove evidence that a write occurred.
+ SequenceNumber earliest_write_conflict_snapshot_;
+
+ const SnapshotChecker* const snapshot_checker_;
+
+ std::shared_ptr<Cache> table_cache_;
+
+ EventLogger* event_logger_;
+
+  // Is this compaction creating a file in the bottommost level?
+ bool bottommost_level_;
+ bool paranoid_file_checks_;
+ bool measure_io_stats_;
+ // Stores the Slices that designate the boundaries for each subcompaction
+ std::vector<Slice> boundaries_;
+ // Stores the approx size of keys covered in the range of each subcompaction
+ std::vector<uint64_t> sizes_;
+ Env::WriteLifeTimeHint write_hint_;
+ Env::Priority thread_pri_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_job_stats_test.cc b/src/rocksdb/db/compaction/compaction_job_stats_test.cc
new file mode 100644
index 000000000..51a665797
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_job_stats_test.cc
@@ -0,0 +1,1043 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <algorithm>
+#include <cinttypes>
+#include <iostream>
+#include <mutex>
+#include <queue>
+#include <set>
+#include <thread>
+#include <unordered_set>
+#include <utility>
+
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "db/job_context.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "env/mock_env.h"
+#include "file/filename.h"
+#include "logging/logging.h"
+#include "memtable/hash_linklist_rep.h"
+#include "monitoring/statistics.h"
+#include "monitoring/thread_status_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/experimental.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/options.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "rocksdb/thread_status.h"
+#include "rocksdb/utilities/checkpoint.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/mock_table.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/scoped_arena_iterator.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/compression.h"
+#include "util/hash.h"
+#include "util/mutexlock.h"
+#include "util/rate_limiter.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+#if !defined(IOS_CROSS_COMPILE)
+#ifndef ROCKSDB_LITE
+namespace ROCKSDB_NAMESPACE {
+
+static std::string RandomString(Random* rnd, int len, double ratio) {
+ std::string r;
+ test::CompressibleString(rnd, ratio, len, &r);
+ return r;
+}
+
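+// Formats `key` as a decimal string zero-padded to `length` characters, e.g.
+// (illustrative) Key(123, 10) yields "0000000123".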
+std::string Key(uint64_t key, int length) {
+ const int kBufSize = 1000;
+ char buf[kBufSize];
+ if (length > kBufSize) {
+ length = kBufSize;
+ }
+ snprintf(buf, kBufSize, "%0*" PRIu64, length, key);
+ return std::string(buf);
+}
+
+class CompactionJobStatsTest : public testing::Test,
+ public testing::WithParamInterface<bool> {
+ public:
+ std::string dbname_;
+ std::string alternative_wal_dir_;
+ Env* env_;
+ DB* db_;
+ std::vector<ColumnFamilyHandle*> handles_;
+ uint32_t max_subcompactions_;
+
+ Options last_options_;
+
+ CompactionJobStatsTest() : env_(Env::Default()) {
+ env_->SetBackgroundThreads(1, Env::LOW);
+ env_->SetBackgroundThreads(1, Env::HIGH);
+ dbname_ = test::PerThreadDBPath("compaction_job_stats_test");
+ alternative_wal_dir_ = dbname_ + "/wal";
+ Options options;
+ options.create_if_missing = true;
+ max_subcompactions_ = GetParam();
+ options.max_subcompactions = max_subcompactions_;
+ auto delete_options = options;
+ delete_options.wal_dir = alternative_wal_dir_;
+ EXPECT_OK(DestroyDB(dbname_, delete_options));
+    // Destroy the DB again in case the alternative WAL dir was not used.
+ EXPECT_OK(DestroyDB(dbname_, options));
+ db_ = nullptr;
+ Reopen(options);
+ }
+
+ ~CompactionJobStatsTest() override {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ Close();
+ Options options;
+ options.db_paths.emplace_back(dbname_, 0);
+ options.db_paths.emplace_back(dbname_ + "_2", 0);
+ options.db_paths.emplace_back(dbname_ + "_3", 0);
+ options.db_paths.emplace_back(dbname_ + "_4", 0);
+ EXPECT_OK(DestroyDB(dbname_, options));
+ }
+
+ // Required if inheriting from testing::WithParamInterface<>
+ static void SetUpTestCase() {}
+ static void TearDownTestCase() {}
+
+ DBImpl* dbfull() {
+ return reinterpret_cast<DBImpl*>(db_);
+ }
+
+ void CreateColumnFamilies(const std::vector<std::string>& cfs,
+ const Options& options) {
+ ColumnFamilyOptions cf_opts(options);
+ size_t cfi = handles_.size();
+ handles_.resize(cfi + cfs.size());
+ for (auto cf : cfs) {
+ ASSERT_OK(db_->CreateColumnFamily(cf_opts, cf, &handles_[cfi++]));
+ }
+ }
+
+ void CreateAndReopenWithCF(const std::vector<std::string>& cfs,
+ const Options& options) {
+ CreateColumnFamilies(cfs, options);
+ std::vector<std::string> cfs_plus_default = cfs;
+ cfs_plus_default.insert(cfs_plus_default.begin(), kDefaultColumnFamilyName);
+ ReopenWithColumnFamilies(cfs_plus_default, options);
+ }
+
+ void ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+ const std::vector<Options>& options) {
+ ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
+ }
+
+ void ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+ const Options& options) {
+ ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
+ }
+
+ Status TryReopenWithColumnFamilies(
+ const std::vector<std::string>& cfs,
+ const std::vector<Options>& options) {
+ Close();
+ EXPECT_EQ(cfs.size(), options.size());
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (size_t i = 0; i < cfs.size(); ++i) {
+ column_families.push_back(ColumnFamilyDescriptor(cfs[i], options[i]));
+ }
+ DBOptions db_opts = DBOptions(options[0]);
+ return DB::Open(db_opts, dbname_, column_families, &handles_, &db_);
+ }
+
+ Status TryReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+ const Options& options) {
+ Close();
+ std::vector<Options> v_opts(cfs.size(), options);
+ return TryReopenWithColumnFamilies(cfs, v_opts);
+ }
+
+ void Reopen(const Options& options) {
+ ASSERT_OK(TryReopen(options));
+ }
+
+ void Close() {
+ for (auto h : handles_) {
+ delete h;
+ }
+ handles_.clear();
+ delete db_;
+ db_ = nullptr;
+ }
+
+ void DestroyAndReopen(const Options& options) {
+ // Destroy using last options
+ Destroy(last_options_);
+ ASSERT_OK(TryReopen(options));
+ }
+
+ void Destroy(const Options& options) {
+ Close();
+ ASSERT_OK(DestroyDB(dbname_, options));
+ }
+
+ Status ReadOnlyReopen(const Options& options) {
+ return DB::OpenForReadOnly(options, dbname_, &db_);
+ }
+
+ Status TryReopen(const Options& options) {
+ Close();
+ last_options_ = options;
+ return DB::Open(options, dbname_, &db_);
+ }
+
+ Status Flush(int cf = 0) {
+ if (cf == 0) {
+ return db_->Flush(FlushOptions());
+ } else {
+ return db_->Flush(FlushOptions(), handles_[cf]);
+ }
+ }
+
+ Status Put(const Slice& k, const Slice& v, WriteOptions wo = WriteOptions()) {
+ return db_->Put(wo, k, v);
+ }
+
+ Status Put(int cf, const Slice& k, const Slice& v,
+ WriteOptions wo = WriteOptions()) {
+ return db_->Put(wo, handles_[cf], k, v);
+ }
+
+ Status Delete(const std::string& k) {
+ return db_->Delete(WriteOptions(), k);
+ }
+
+ Status Delete(int cf, const std::string& k) {
+ return db_->Delete(WriteOptions(), handles_[cf], k);
+ }
+
+ std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) {
+ ReadOptions options;
+ options.verify_checksums = true;
+ options.snapshot = snapshot;
+ std::string result;
+ Status s = db_->Get(options, k, &result);
+ if (s.IsNotFound()) {
+ result = "NOT_FOUND";
+ } else if (!s.ok()) {
+ result = s.ToString();
+ }
+ return result;
+ }
+
+ std::string Get(int cf, const std::string& k,
+ const Snapshot* snapshot = nullptr) {
+ ReadOptions options;
+ options.verify_checksums = true;
+ options.snapshot = snapshot;
+ std::string result;
+ Status s = db_->Get(options, handles_[cf], k, &result);
+ if (s.IsNotFound()) {
+ result = "NOT_FOUND";
+ } else if (!s.ok()) {
+ result = s.ToString();
+ }
+ return result;
+ }
+
+ int NumTableFilesAtLevel(int level, int cf = 0) {
+ std::string property;
+ if (cf == 0) {
+ // default cfd
+ EXPECT_TRUE(db_->GetProperty(
+ "rocksdb.num-files-at-level" + NumberToString(level), &property));
+ } else {
+ EXPECT_TRUE(db_->GetProperty(
+ handles_[cf], "rocksdb.num-files-at-level" + NumberToString(level),
+ &property));
+ }
+ return atoi(property.c_str());
+ }
+
+ // Return spread of files per level
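+  // (e.g. "2,0,3" would mean 2 files in L0, none in L1 and 3 in L2; trailing
+  // empty levels are trimmed)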
+ std::string FilesPerLevel(int cf = 0) {
+ int num_levels =
+ (cf == 0) ? db_->NumberLevels() : db_->NumberLevels(handles_[1]);
+ std::string result;
+ size_t last_non_zero_offset = 0;
+ for (int level = 0; level < num_levels; level++) {
+ int f = NumTableFilesAtLevel(level, cf);
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
+ result += buf;
+ if (f > 0) {
+ last_non_zero_offset = result.size();
+ }
+ }
+ result.resize(last_non_zero_offset);
+ return result;
+ }
+
+ uint64_t Size(const Slice& start, const Slice& limit, int cf = 0) {
+ Range r(start, limit);
+ uint64_t size;
+ if (cf == 0) {
+ db_->GetApproximateSizes(&r, 1, &size);
+ } else {
+ db_->GetApproximateSizes(handles_[1], &r, 1, &size);
+ }
+ return size;
+ }
+
+ void Compact(int cf, const Slice& start, const Slice& limit,
+ uint32_t target_path_id) {
+ CompactRangeOptions compact_options;
+ compact_options.target_path_id = target_path_id;
+ ASSERT_OK(db_->CompactRange(compact_options, handles_[cf], &start, &limit));
+ }
+
+ void Compact(int cf, const Slice& start, const Slice& limit) {
+ ASSERT_OK(
+ db_->CompactRange(CompactRangeOptions(), handles_[cf], &start, &limit));
+ }
+
+ void Compact(const Slice& start, const Slice& limit) {
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &limit));
+ }
+
+ void TEST_Compact(int level, int cf, const Slice& start, const Slice& limit) {
+ ASSERT_OK(dbfull()->TEST_CompactRange(level, &start, &limit, handles_[cf],
+ true /* disallow trivial move */));
+ }
+
+ // Do n memtable compactions, each of which produces an sstable
+ // covering the range [small,large].
+ void MakeTables(int n, const std::string& small, const std::string& large,
+ int cf = 0) {
+ for (int i = 0; i < n; i++) {
+ ASSERT_OK(Put(cf, small, "begin"));
+ ASSERT_OK(Put(cf, large, "end"));
+ ASSERT_OK(Flush(cf));
+ }
+ }
+
+ static void SetDeletionCompactionStats(
+ CompactionJobStats *stats, uint64_t input_deletions,
+ uint64_t expired_deletions, uint64_t records_replaced) {
+ stats->num_input_deletion_records = input_deletions;
+ stats->num_expired_deletion_records = expired_deletions;
+ stats->num_records_replaced = records_replaced;
+ }
+
+ void MakeTableWithKeyValues(
+ Random* rnd, uint64_t smallest, uint64_t largest,
+ int key_size, int value_size, uint64_t interval,
+ double ratio, int cf = 0) {
+ for (auto key = smallest; key < largest; key += interval) {
+ ASSERT_OK(Put(cf, Slice(Key(key, key_size)),
+ Slice(RandomString(rnd, value_size, ratio))));
+ }
+ ASSERT_OK(Flush(cf));
+ }
+
+  // This function assumes that two rounds of keys have already been
+  // inserted into the database, matching the behavior of the
+  // DeletionStatsTest.
+ void SelectivelyDeleteKeys(uint64_t smallest, uint64_t largest,
+ uint64_t interval, int deletion_interval, int key_size,
+ uint64_t cutoff_key_num, CompactionJobStats* stats, int cf = 0) {
+
+    // interval needs to be >= 2 so that deletion entries offset by 1 from
+    // existing keys can be inserted; such entries are intended not to
+    // result in an actual key deletion.
+ ASSERT_GE(interval, 2);
+
+ uint64_t ctr = 1;
+ uint32_t deletions_made = 0;
+ uint32_t num_deleted = 0;
+ uint32_t num_expired = 0;
+ for (auto key = smallest; key <= largest; key += interval, ctr++) {
+ if (ctr % deletion_interval == 0) {
+ ASSERT_OK(Delete(cf, Key(key, key_size)));
+ deletions_made++;
+ num_deleted++;
+
+ if (key > cutoff_key_num) {
+ num_expired++;
+ }
+ }
+ }
+
+    // Insert some deletions for keys that don't exist, both inside and
+    // outside the key range
+ ASSERT_OK(Delete(cf, Key(smallest+1, key_size)));
+ deletions_made++;
+
+ ASSERT_OK(Delete(cf, Key(smallest-1, key_size)));
+ deletions_made++;
+ num_expired++;
+
+ ASSERT_OK(Delete(cf, Key(smallest-9, key_size)));
+ deletions_made++;
+ num_expired++;
+
+ ASSERT_OK(Flush(cf));
+ SetDeletionCompactionStats(stats, deletions_made, num_expired,
+ num_deleted);
+ }
+};
+
+// An EventListener which helps verify the compaction results in
+// test CompactionJobStatsTest.
+class CompactionJobStatsChecker : public EventListener {
+ public:
+ CompactionJobStatsChecker()
+ : compression_enabled_(false), verify_next_comp_io_stats_(false) {}
+
+ size_t NumberOfUnverifiedStats() { return expected_stats_.size(); }
+
+ void set_verify_next_comp_io_stats(bool v) { verify_next_comp_io_stats_ = v; }
+
+  // Once a compaction completes, this function verifies the stats in the
+  // returned CompactionJobInfo against the oldest expected stats added
+  // earlier to "expected_stats_" that have not yet been used for
+  // verification.
+ void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) override {
+ if (verify_next_comp_io_stats_) {
+ ASSERT_GT(ci.stats.file_write_nanos, 0);
+ ASSERT_GT(ci.stats.file_range_sync_nanos, 0);
+ ASSERT_GT(ci.stats.file_fsync_nanos, 0);
+ ASSERT_GT(ci.stats.file_prepare_write_nanos, 0);
+ verify_next_comp_io_stats_ = false;
+ }
+
+ std::lock_guard<std::mutex> lock(mutex_);
+ if (expected_stats_.size()) {
+ Verify(ci.stats, expected_stats_.front());
+ expected_stats_.pop();
+ }
+ }
+
+  // A helper function which verifies whether two CompactionJobStats
+  // match. The verification of all compaction stats is done with
+  // ASSERT_EQ, except for the total input / output bytes, for which we
+  // use ASSERT_GE and ASSERT_LE with a reasonable bias ---
+  // 10% in the uncompressed case and 20% when compression is used.
+ virtual void Verify(const CompactionJobStats& current_stats,
+ const CompactionJobStats& stats) {
+ // time
+ ASSERT_GT(current_stats.elapsed_micros, 0U);
+
+ ASSERT_EQ(current_stats.num_input_records,
+ stats.num_input_records);
+ ASSERT_EQ(current_stats.num_input_files,
+ stats.num_input_files);
+ ASSERT_EQ(current_stats.num_input_files_at_output_level,
+ stats.num_input_files_at_output_level);
+
+ ASSERT_EQ(current_stats.num_output_records,
+ stats.num_output_records);
+ ASSERT_EQ(current_stats.num_output_files,
+ stats.num_output_files);
+
+ ASSERT_EQ(current_stats.is_manual_compaction,
+ stats.is_manual_compaction);
+
+ // file size
+ double kFileSizeBias = compression_enabled_ ? 0.20 : 0.10;
+ ASSERT_GE(current_stats.total_input_bytes * (1.00 + kFileSizeBias),
+ stats.total_input_bytes);
+ ASSERT_LE(current_stats.total_input_bytes,
+ stats.total_input_bytes * (1.00 + kFileSizeBias));
+ ASSERT_GE(current_stats.total_output_bytes * (1.00 + kFileSizeBias),
+ stats.total_output_bytes);
+ ASSERT_LE(current_stats.total_output_bytes,
+ stats.total_output_bytes * (1.00 + kFileSizeBias));
+ ASSERT_EQ(current_stats.total_input_raw_key_bytes,
+ stats.total_input_raw_key_bytes);
+ ASSERT_EQ(current_stats.total_input_raw_value_bytes,
+ stats.total_input_raw_value_bytes);
+
+ ASSERT_EQ(current_stats.num_records_replaced,
+ stats.num_records_replaced);
+
+ ASSERT_EQ(current_stats.num_corrupt_keys,
+ stats.num_corrupt_keys);
+
+ ASSERT_EQ(
+ std::string(current_stats.smallest_output_key_prefix),
+ std::string(stats.smallest_output_key_prefix));
+ ASSERT_EQ(
+ std::string(current_stats.largest_output_key_prefix),
+ std::string(stats.largest_output_key_prefix));
+ }
+
+  // Add expected compaction stats, which will be used to
+ // verify the CompactionJobStats returned by the OnCompactionCompleted()
+ // callback.
+ void AddExpectedStats(const CompactionJobStats& stats) {
+ std::lock_guard<std::mutex> lock(mutex_);
+ expected_stats_.push(stats);
+ }
+
+ void EnableCompression(bool flag) {
+ compression_enabled_ = flag;
+ }
+
+ bool verify_next_comp_io_stats() const { return verify_next_comp_io_stats_; }
+
+ private:
+ std::mutex mutex_;
+ std::queue<CompactionJobStats> expected_stats_;
+ bool compression_enabled_;
+ bool verify_next_comp_io_stats_;
+};
+
+// An EventListener which helps verify the compaction statistics in
+// the test DeletionStatsTest.
+class CompactionJobDeletionStatsChecker : public CompactionJobStatsChecker {
+ public:
+ // Verifies whether two CompactionJobStats match.
+ void Verify(const CompactionJobStats& current_stats,
+ const CompactionJobStats& stats) override {
+ ASSERT_EQ(
+ current_stats.num_input_deletion_records,
+ stats.num_input_deletion_records);
+ ASSERT_EQ(
+ current_stats.num_expired_deletion_records,
+ stats.num_expired_deletion_records);
+ ASSERT_EQ(
+ current_stats.num_records_replaced,
+ stats.num_records_replaced);
+
+ ASSERT_EQ(current_stats.num_corrupt_keys,
+ stats.num_corrupt_keys);
+ }
+};
+
+namespace {
+
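+// A worked example (illustrative arithmetic only): with the default
+// parameters, EstimatedFileSize(100, 10, 1000) gives
+// data_size = 100 * (10 + 1000 + 8) = 101800 bytes, plus a 512-byte footer,
+// a 100 * 10 / 8 = 125-byte filter block and a 101800 * 18 / 4096 = 447-byte
+// index block, roughly 103KB in total.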
+uint64_t EstimatedFileSize(
+ uint64_t num_records, size_t key_size, size_t value_size,
+ double compression_ratio = 1.0,
+ size_t block_size = 4096,
+ int bloom_bits_per_key = 10) {
+ const size_t kPerKeyOverhead = 8;
+ const size_t kFooterSize = 512;
+
+ uint64_t data_size =
+ static_cast<uint64_t>(
+ num_records * (key_size + value_size * compression_ratio +
+ kPerKeyOverhead));
+
+ return data_size + kFooterSize
+ + num_records * bloom_bits_per_key / 8 // filter block
+ + data_size * (key_size + 8) / block_size; // index block
+}
+
+namespace {
+
+void CopyPrefix(
+ const Slice& src, size_t prefix_length, std::string* dst) {
+ assert(prefix_length > 0);
+ size_t length = src.size() > prefix_length ? prefix_length : src.size();
+ dst->assign(src.data(), length);
+}
+
+} // namespace
+
+CompactionJobStats NewManualCompactionJobStats(
+ const std::string& smallest_key, const std::string& largest_key,
+ size_t num_input_files, size_t num_input_files_at_output_level,
+ uint64_t num_input_records, size_t key_size, size_t value_size,
+ size_t num_output_files, uint64_t num_output_records,
+ double compression_ratio, uint64_t num_records_replaced,
+ bool is_manual = true) {
+ CompactionJobStats stats;
+ stats.Reset();
+
+ stats.num_input_records = num_input_records;
+ stats.num_input_files = num_input_files;
+ stats.num_input_files_at_output_level = num_input_files_at_output_level;
+
+ stats.num_output_records = num_output_records;
+ stats.num_output_files = num_output_files;
+
+ stats.total_input_bytes =
+ EstimatedFileSize(
+ num_input_records / num_input_files,
+ key_size, value_size, compression_ratio) * num_input_files;
+ stats.total_output_bytes =
+ EstimatedFileSize(
+ num_output_records / num_output_files,
+ key_size, value_size, compression_ratio) * num_output_files;
+ stats.total_input_raw_key_bytes =
+ num_input_records * (key_size + 8);
+ stats.total_input_raw_value_bytes =
+ num_input_records * value_size;
+
+ stats.is_manual_compaction = is_manual;
+
+ stats.num_records_replaced = num_records_replaced;
+
+ CopyPrefix(smallest_key,
+ CompactionJobStats::kMaxPrefixLength,
+ &stats.smallest_output_key_prefix);
+ CopyPrefix(largest_key,
+ CompactionJobStats::kMaxPrefixLength,
+ &stats.largest_output_key_prefix);
+
+ return stats;
+}
+
+CompressionType GetAnyCompression() {
+ if (Snappy_Supported()) {
+ return kSnappyCompression;
+ } else if (Zlib_Supported()) {
+ return kZlibCompression;
+ } else if (BZip2_Supported()) {
+ return kBZip2Compression;
+ } else if (LZ4_Supported()) {
+ return kLZ4Compression;
+ } else if (XPRESS_Supported()) {
+ return kXpressCompression;
+ }
+
+ return kNoCompression;
+}
+
+} // namespace
+
+TEST_P(CompactionJobStatsTest, CompactionJobStatsTest) {
+ Random rnd(301);
+ const int kBufSize = 100;
+ char buf[kBufSize];
+ uint64_t key_base = 100000000l;
+ // Note: key_base must be multiple of num_keys_per_L0_file
+ int num_keys_per_L0_file = 100;
+ const int kTestScale = 8;
+ const int kKeySize = 10;
+ const int kValueSize = 1000;
+ const double kCompressionRatio = 0.5;
+ double compression_ratio = 1.0;
+ uint64_t key_interval = key_base / num_keys_per_L0_file;
+
+ // Whenever a compaction completes, this listener will try to
+ // verify whether the returned CompactionJobStats matches
+ // what we expect. The expected CompactionJobStats is added
+ // via AddExpectedStats().
+ auto* stats_checker = new CompactionJobStatsChecker();
+ Options options;
+ options.listeners.emplace_back(stats_checker);
+ options.create_if_missing = true;
+  // Just enough settings to hold off auto-compaction.
+ options.level0_file_num_compaction_trigger = kTestScale + 1;
+ options.num_levels = 3;
+ options.compression = kNoCompression;
+ options.max_subcompactions = max_subcompactions_;
+ options.bytes_per_sync = 512 * 1024;
+
+ options.report_bg_io_stats = true;
+ for (int test = 0; test < 2; ++test) {
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // 1st Phase: generate "num_L0_files" L0 files.
+ int num_L0_files = 0;
+ for (uint64_t start_key = key_base;
+ start_key <= key_base * kTestScale;
+ start_key += key_base) {
+ MakeTableWithKeyValues(
+ &rnd, start_key, start_key + key_base - 1,
+ kKeySize, kValueSize, key_interval,
+ compression_ratio, 1);
+ snprintf(buf, kBufSize, "%d", ++num_L0_files);
+ ASSERT_EQ(std::string(buf), FilesPerLevel(1));
+ }
+ ASSERT_EQ(ToString(num_L0_files), FilesPerLevel(1));
+
+ // 2nd Phase: perform L0 -> L1 compaction.
+ int L0_compaction_count = 6;
+ int count = 1;
+ std::string smallest_key;
+ std::string largest_key;
+ for (uint64_t start_key = key_base;
+ start_key <= key_base * L0_compaction_count;
+ start_key += key_base, count++) {
+ smallest_key = Key(start_key, 10);
+ largest_key = Key(start_key + key_base - key_interval, 10);
+ stats_checker->AddExpectedStats(
+ NewManualCompactionJobStats(
+ smallest_key, largest_key,
+ 1, 0, num_keys_per_L0_file,
+ kKeySize, kValueSize,
+ 1, num_keys_per_L0_file,
+ compression_ratio, 0));
+ ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 1U);
+ TEST_Compact(0, 1, smallest_key, largest_key);
+ snprintf(buf, kBufSize, "%d,%d", num_L0_files - count, count);
+ ASSERT_EQ(std::string(buf), FilesPerLevel(1));
+ }
+
+ // compact two files into one in the last L0 -> L1 compaction
+ int num_remaining_L0 = num_L0_files - L0_compaction_count;
+ smallest_key = Key(key_base * (L0_compaction_count + 1), 10);
+ largest_key = Key(key_base * (kTestScale + 1) - key_interval, 10);
+ stats_checker->AddExpectedStats(
+ NewManualCompactionJobStats(
+ smallest_key, largest_key,
+ num_remaining_L0,
+ 0, num_keys_per_L0_file * num_remaining_L0,
+ kKeySize, kValueSize,
+ 1, num_keys_per_L0_file * num_remaining_L0,
+ compression_ratio, 0));
+ ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 1U);
+ TEST_Compact(0, 1, smallest_key, largest_key);
+
+ int num_L1_files = num_L0_files - num_remaining_L0 + 1;
+ num_L0_files = 0;
+ snprintf(buf, kBufSize, "%d,%d", num_L0_files, num_L1_files);
+ ASSERT_EQ(std::string(buf), FilesPerLevel(1));
+
+ // 3rd Phase: generate sparse L0 files (wider key-range, same num of keys)
+ int sparseness = 2;
+ for (uint64_t start_key = key_base;
+ start_key <= key_base * kTestScale;
+ start_key += key_base * sparseness) {
+ MakeTableWithKeyValues(
+ &rnd, start_key, start_key + key_base * sparseness - 1,
+ kKeySize, kValueSize,
+ key_base * sparseness / num_keys_per_L0_file,
+ compression_ratio, 1);
+ snprintf(buf, kBufSize, "%d,%d", ++num_L0_files, num_L1_files);
+ ASSERT_EQ(std::string(buf), FilesPerLevel(1));
+ }
+
+    // 4th Phase: perform L0 -> L1 compaction again; expect higher write amp.
+    // When subcompactions are enabled, the number of output files increases
+    // by 1 because multiple threads consume the input and generate output
+    // files without coordinating to see whether the output could fit into
+    // a smaller number of files, as it does when the compaction runs
+    // sequentially.
+ int num_output_files = options.max_subcompactions > 1 ? 2 : 1;
+ for (uint64_t start_key = key_base;
+ num_L0_files > 1;
+ start_key += key_base * sparseness) {
+ smallest_key = Key(start_key, 10);
+ largest_key =
+ Key(start_key + key_base * sparseness - key_interval, 10);
+ stats_checker->AddExpectedStats(
+ NewManualCompactionJobStats(
+ smallest_key, largest_key,
+ 3, 2, num_keys_per_L0_file * 3,
+ kKeySize, kValueSize,
+ num_output_files,
+ num_keys_per_L0_file * 2, // 1/3 of the data will be updated.
+ compression_ratio,
+ num_keys_per_L0_file));
+ ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 1U);
+ Compact(1, smallest_key, largest_key);
+ if (options.max_subcompactions == 1) {
+ --num_L1_files;
+ }
+ snprintf(buf, kBufSize, "%d,%d", --num_L0_files, num_L1_files);
+ ASSERT_EQ(std::string(buf), FilesPerLevel(1));
+ }
+
+    // 5th Phase: Do a full compaction, which involves two sub-compactions.
+    // Here we expect to have 1 L0 file and 4 L1 files.
+    // In the first sub-compaction, we expect an L0 compaction.
+ smallest_key = Key(key_base, 10);
+ largest_key = Key(key_base * (kTestScale + 1) - key_interval, 10);
+ stats_checker->AddExpectedStats(
+ NewManualCompactionJobStats(
+ Key(key_base * (kTestScale + 1 - sparseness), 10), largest_key,
+ 2, 1, num_keys_per_L0_file * 3,
+ kKeySize, kValueSize,
+ 1, num_keys_per_L0_file * 2,
+ compression_ratio,
+ num_keys_per_L0_file));
+ ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 1U);
+ Compact(1, smallest_key, largest_key);
+
+ num_L1_files = options.max_subcompactions > 1 ? 7 : 4;
+ char L1_buf[4];
+ snprintf(L1_buf, sizeof(L1_buf), "0,%d", num_L1_files);
+ std::string L1_files(L1_buf);
+ ASSERT_EQ(L1_files, FilesPerLevel(1));
+ options.compression = GetAnyCompression();
+ if (options.compression == kNoCompression) {
+ break;
+ }
+ stats_checker->EnableCompression(true);
+ compression_ratio = kCompressionRatio;
+
+ for (int i = 0; i < 5; i++) {
+ ASSERT_OK(Put(1, Slice(Key(key_base + i, 10)),
+ Slice(RandomString(&rnd, 512 * 1024, 1))));
+ }
+
+ ASSERT_OK(Flush(1));
+ reinterpret_cast<DBImpl*>(db_)->TEST_WaitForCompact();
+
+ stats_checker->set_verify_next_comp_io_stats(true);
+ std::atomic<bool> first_prepare_write(true);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WritableFileWriter::Append:BeforePrepareWrite", [&](void* /*arg*/) {
+ if (first_prepare_write.load()) {
+ options.env->SleepForMicroseconds(3);
+ first_prepare_write.store(false);
+ }
+ });
+
+ std::atomic<bool> first_flush(true);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WritableFileWriter::Flush:BeforeAppend", [&](void* /*arg*/) {
+ if (first_flush.load()) {
+ options.env->SleepForMicroseconds(3);
+ first_flush.store(false);
+ }
+ });
+
+ std::atomic<bool> first_sync(true);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WritableFileWriter::SyncInternal:0", [&](void* /*arg*/) {
+ if (first_sync.load()) {
+ options.env->SleepForMicroseconds(3);
+ first_sync.store(false);
+ }
+ });
+
+ std::atomic<bool> first_range_sync(true);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WritableFileWriter::RangeSync:0", [&](void* /*arg*/) {
+ if (first_range_sync.load()) {
+ options.env->SleepForMicroseconds(3);
+ first_range_sync.store(false);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Compact(1, smallest_key, largest_key);
+
+ ASSERT_TRUE(!stats_checker->verify_next_comp_io_stats());
+ ASSERT_TRUE(!first_prepare_write.load());
+ ASSERT_TRUE(!first_flush.load());
+ ASSERT_TRUE(!first_sync.load());
+ ASSERT_TRUE(!first_range_sync.load());
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+ ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 0U);
+}
+
+TEST_P(CompactionJobStatsTest, DeletionStatsTest) {
+ Random rnd(301);
+ uint64_t key_base = 100000l;
+ // Note: key_base must be multiple of num_keys_per_L0_file
+ int num_keys_per_L0_file = 20;
+ const int kTestScale = 8; // make sure this is even
+ const int kKeySize = 10;
+ const int kValueSize = 100;
+ double compression_ratio = 1.0;
+ uint64_t key_interval = key_base / num_keys_per_L0_file;
+ uint64_t largest_key_num = key_base * (kTestScale + 1) - key_interval;
+ uint64_t cutoff_key_num = key_base * (kTestScale / 2 + 1) - key_interval;
+ const std::string smallest_key = Key(key_base - 10, kKeySize);
+ const std::string largest_key = Key(largest_key_num + 10, kKeySize);
+
+ // Whenever a compaction completes, this listener will try to
+ // verify whether the returned CompactionJobStats matches
+ // what we expect.
+ auto* stats_checker = new CompactionJobDeletionStatsChecker();
+ Options options;
+ options.listeners.emplace_back(stats_checker);
+ options.create_if_missing = true;
+ options.level0_file_num_compaction_trigger = kTestScale+1;
+ options.num_levels = 3;
+ options.compression = kNoCompression;
+ options.max_bytes_for_level_multiplier = 2;
+ options.max_subcompactions = max_subcompactions_;
+
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Stage 1: Generate several L0 files and then send them to L2 by
+ // using CompactRangeOptions and CompactRange(). These files will
+ // have a strict subset of the keys from the full key-range
+ for (uint64_t start_key = key_base;
+ start_key <= key_base * kTestScale / 2;
+ start_key += key_base) {
+ MakeTableWithKeyValues(
+ &rnd, start_key, start_key + key_base - 1,
+ kKeySize, kValueSize, key_interval,
+ compression_ratio, 1);
+ }
+
+ CompactRangeOptions cr_options;
+ cr_options.change_level = true;
+ cr_options.target_level = 2;
+ db_->CompactRange(cr_options, handles_[1], nullptr, nullptr);
+ ASSERT_GT(NumTableFilesAtLevel(2, 1), 0);
+
+ // Stage 2: Generate files including keys from the entire key range
+ for (uint64_t start_key = key_base;
+ start_key <= key_base * kTestScale;
+ start_key += key_base) {
+ MakeTableWithKeyValues(
+ &rnd, start_key, start_key + key_base - 1,
+ kKeySize, kValueSize, key_interval,
+ compression_ratio, 1);
+ }
+
+ // Send these L0 files to L1
+ TEST_Compact(0, 1, smallest_key, largest_key);
+ ASSERT_GT(NumTableFilesAtLevel(1, 1), 0);
+
+  // Add a new record and flush so that there is now an L0 file
+  // with a value too (not just the deletions from the next step)
+ ASSERT_OK(Put(1, Key(key_base-6, kKeySize), "test"));
+ ASSERT_OK(Flush(1));
+
+ // Stage 3: Generate L0 files with some deletions so now
+ // there are files with the same key range in L0, L1, and L2
+ int deletion_interval = 3;
+ CompactionJobStats first_compaction_stats;
+ SelectivelyDeleteKeys(key_base, largest_key_num,
+ key_interval, deletion_interval, kKeySize, cutoff_key_num,
+ &first_compaction_stats, 1);
+
+ stats_checker->AddExpectedStats(first_compaction_stats);
+
+ // Stage 4: Trigger compaction and verify the stats
+ TEST_Compact(0, 1, smallest_key, largest_key);
+}
+
+namespace {
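+// Returns the lowest set bit of num_flushes, or 0 when that bit is 1. In this
+// test it approximates how many sorted runs the next universal compaction is
+// expected to merge after the given number of flushes (an assumption of the
+// test setup, not a general rule). Illustrative values: 6 (0b110) -> 2,
+// 4 (0b100) -> 4, 5 (0b101) -> 0.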
+int GetUniversalCompactionInputUnits(uint32_t num_flushes) {
+ uint32_t compaction_input_units;
+ for (compaction_input_units = 1;
+ num_flushes >= compaction_input_units;
+ compaction_input_units *= 2) {
+ if ((num_flushes & compaction_input_units) != 0) {
+ return compaction_input_units > 1 ? compaction_input_units : 0;
+ }
+ }
+ return 0;
+}
+} // namespace
+
+TEST_P(CompactionJobStatsTest, UniversalCompactionTest) {
+ Random rnd(301);
+ uint64_t key_base = 100000000l;
+ // Note: key_base must be multiple of num_keys_per_L0_file
+ int num_keys_per_table = 100;
+ const uint32_t kTestScale = 6;
+ const int kKeySize = 10;
+ const int kValueSize = 900;
+ double compression_ratio = 1.0;
+ uint64_t key_interval = key_base / num_keys_per_table;
+
+ auto* stats_checker = new CompactionJobStatsChecker();
+ Options options;
+ options.listeners.emplace_back(stats_checker);
+ options.create_if_missing = true;
+ options.num_levels = 3;
+ options.compression = kNoCompression;
+ options.level0_file_num_compaction_trigger = 2;
+ options.target_file_size_base = num_keys_per_table * 1000;
+ options.compaction_style = kCompactionStyleUniversal;
+ options.compaction_options_universal.size_ratio = 1;
+ options.compaction_options_universal.max_size_amplification_percent = 1000;
+ options.max_subcompactions = max_subcompactions_;
+
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Generates the expected CompactionJobStats for each compaction
+ for (uint32_t num_flushes = 2; num_flushes <= kTestScale; num_flushes++) {
+    // Here we treat one newly flushed file as a unit.
+ //
+ // For example, if a newly flushed file is 100k, and a compaction has
+ // 4 input units, then this compaction inputs 400k.
+ uint32_t num_input_units = GetUniversalCompactionInputUnits(num_flushes);
+ if (num_input_units == 0) {
+ continue;
+ }
+ // The following statement determines the expected smallest key
+ // based on whether it is a full compaction. A full compaction only
+    // happens when the number of flushes equals the number of compaction
+ // input runs.
+ uint64_t smallest_key =
+ (num_flushes == num_input_units) ?
+ key_base : key_base * (num_flushes - 1);
+
+ stats_checker->AddExpectedStats(
+ NewManualCompactionJobStats(
+ Key(smallest_key, 10),
+ Key(smallest_key + key_base * num_input_units - key_interval, 10),
+ num_input_units,
+ num_input_units > 2 ? num_input_units / 2 : 0,
+ num_keys_per_table * num_input_units,
+ kKeySize, kValueSize,
+ num_input_units,
+ num_keys_per_table * num_input_units,
+ 1.0, 0, false));
+ dbfull()->TEST_WaitForCompact();
+ }
+ ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 3U);
+
+ for (uint64_t start_key = key_base;
+ start_key <= key_base * kTestScale;
+ start_key += key_base) {
+ MakeTableWithKeyValues(
+ &rnd, start_key, start_key + key_base - 1,
+ kKeySize, kValueSize, key_interval,
+ compression_ratio, 1);
+ reinterpret_cast<DBImpl*>(db_)->TEST_WaitForCompact();
+ }
+ ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 0U);
+}
+
+INSTANTIATE_TEST_CASE_P(CompactionJobStatsTest, CompactionJobStatsTest,
+ ::testing::Values(1, 4));
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr, "SKIPPED, not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
+
+#else
+
+int main(int /*argc*/, char** /*argv*/) { return 0; }
+#endif // !defined(IOS_CROSS_COMPILE)
diff --git a/src/rocksdb/db/compaction/compaction_job_test.cc b/src/rocksdb/db/compaction/compaction_job_test.cc
new file mode 100644
index 000000000..e7b46ef97
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_job_test.cc
@@ -0,0 +1,1082 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include <algorithm>
+#include <array>
+#include <cinttypes>
+#include <map>
+#include <string>
+#include <tuple>
+
+#include "db/blob_index.h"
+#include "db/column_family.h"
+#include "db/compaction/compaction_job.h"
+#include "db/db_impl/db_impl.h"
+#include "db/error_handler.h"
+#include "db/version_set.h"
+#include "file/writable_file_writer.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/mock_table.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+void VerifyInitializationOfCompactionJobStats(
+ const CompactionJobStats& compaction_job_stats) {
+#if !defined(IOS_CROSS_COMPILE)
+ ASSERT_EQ(compaction_job_stats.elapsed_micros, 0U);
+
+ ASSERT_EQ(compaction_job_stats.num_input_records, 0U);
+ ASSERT_EQ(compaction_job_stats.num_input_files, 0U);
+ ASSERT_EQ(compaction_job_stats.num_input_files_at_output_level, 0U);
+
+ ASSERT_EQ(compaction_job_stats.num_output_records, 0U);
+ ASSERT_EQ(compaction_job_stats.num_output_files, 0U);
+
+ ASSERT_EQ(compaction_job_stats.is_manual_compaction, true);
+
+ ASSERT_EQ(compaction_job_stats.total_input_bytes, 0U);
+ ASSERT_EQ(compaction_job_stats.total_output_bytes, 0U);
+
+ ASSERT_EQ(compaction_job_stats.total_input_raw_key_bytes, 0U);
+ ASSERT_EQ(compaction_job_stats.total_input_raw_value_bytes, 0U);
+
+ ASSERT_EQ(compaction_job_stats.smallest_output_key_prefix[0], 0);
+ ASSERT_EQ(compaction_job_stats.largest_output_key_prefix[0], 0);
+
+ ASSERT_EQ(compaction_job_stats.num_records_replaced, 0U);
+
+ ASSERT_EQ(compaction_job_stats.num_input_deletion_records, 0U);
+ ASSERT_EQ(compaction_job_stats.num_expired_deletion_records, 0U);
+
+ ASSERT_EQ(compaction_job_stats.num_corrupt_keys, 0U);
+#endif // !defined(IOS_CROSS_COMPILE)
+}
+
+} // namespace
+
+// TODO(icanadi) Make it simpler once we mock out VersionSet
+class CompactionJobTest : public testing::Test {
+ public:
+ CompactionJobTest()
+ : env_(Env::Default()),
+ fs_(std::make_shared<LegacyFileSystemWrapper>(env_)),
+ dbname_(test::PerThreadDBPath("compaction_job_test")),
+ db_options_(),
+ mutable_cf_options_(cf_options_),
+ table_cache_(NewLRUCache(50000, 16)),
+ write_buffer_manager_(db_options_.db_write_buffer_size),
+ versions_(new VersionSet(dbname_, &db_options_, env_options_,
+ table_cache_.get(), &write_buffer_manager_,
+ &write_controller_,
+ /*block_cache_tracer=*/nullptr)),
+ shutting_down_(false),
+ preserve_deletes_seqnum_(0),
+ mock_table_factory_(new mock::MockTableFactory()),
+ error_handler_(nullptr, db_options_, &mutex_) {
+ EXPECT_OK(env_->CreateDirIfMissing(dbname_));
+ db_options_.env = env_;
+ db_options_.fs = fs_;
+ db_options_.db_paths.emplace_back(dbname_,
+ std::numeric_limits<uint64_t>::max());
+ }
+
+ std::string GenerateFileName(uint64_t file_number) {
+ FileMetaData meta;
+ std::vector<DbPath> db_paths;
+ db_paths.emplace_back(dbname_, std::numeric_limits<uint64_t>::max());
+ meta.fd = FileDescriptor(file_number, 0, 0);
+ return TableFileName(db_paths, meta.fd.GetNumber(), meta.fd.GetPathId());
+ }
+
+ static std::string KeyStr(const std::string& user_key,
+ const SequenceNumber seq_num, const ValueType t) {
+ return InternalKey(user_key, seq_num, t).Encode().ToString();
+ }
+
+ static std::string BlobStr(uint64_t blob_file_number, uint64_t offset,
+ uint64_t size) {
+ std::string blob_index;
+ BlobIndex::EncodeBlob(&blob_index, blob_file_number, offset, size,
+ kNoCompression);
+ return blob_index;
+ }
+
+ static std::string BlobStrTTL(uint64_t blob_file_number, uint64_t offset,
+ uint64_t size, uint64_t expiration) {
+ std::string blob_index;
+ BlobIndex::EncodeBlobTTL(&blob_index, expiration, blob_file_number, offset,
+ size, kNoCompression);
+ return blob_index;
+ }
+
+ static std::string BlobStrInlinedTTL(const Slice& value,
+ uint64_t expiration) {
+ std::string blob_index;
+ BlobIndex::EncodeInlinedTTL(&blob_index, expiration, value);
+ return blob_index;
+ }
+
+ void AddMockFile(const stl_wrappers::KVMap& contents, int level = 0) {
+ assert(contents.size() > 0);
+
+ bool first_key = true;
+ std::string smallest, largest;
+ InternalKey smallest_key, largest_key;
+ SequenceNumber smallest_seqno = kMaxSequenceNumber;
+ SequenceNumber largest_seqno = 0;
+ uint64_t oldest_blob_file_number = kInvalidBlobFileNumber;
+ for (auto kv : contents) {
+ ParsedInternalKey key;
+ std::string skey;
+ std::string value;
+ std::tie(skey, value) = kv;
+ bool parsed = ParseInternalKey(skey, &key);
+
+ smallest_seqno = std::min(smallest_seqno, key.sequence);
+ largest_seqno = std::max(largest_seqno, key.sequence);
+
+ if (first_key ||
+ cfd_->user_comparator()->Compare(key.user_key, smallest) < 0) {
+ smallest.assign(key.user_key.data(), key.user_key.size());
+ smallest_key.DecodeFrom(skey);
+ }
+ if (first_key ||
+ cfd_->user_comparator()->Compare(key.user_key, largest) > 0) {
+ largest.assign(key.user_key.data(), key.user_key.size());
+ largest_key.DecodeFrom(skey);
+ }
+
+ first_key = false;
+
+ if (parsed && key.type == kTypeBlobIndex) {
+ BlobIndex blob_index;
+ const Status s = blob_index.DecodeFrom(value);
+ if (!s.ok()) {
+ continue;
+ }
+
+ if (blob_index.IsInlined() || blob_index.HasTTL() ||
+ blob_index.file_number() == kInvalidBlobFileNumber) {
+ continue;
+ }
+
+ if (oldest_blob_file_number == kInvalidBlobFileNumber ||
+ oldest_blob_file_number > blob_index.file_number()) {
+ oldest_blob_file_number = blob_index.file_number();
+ }
+ }
+ }
+
+ uint64_t file_number = versions_->NewFileNumber();
+ EXPECT_OK(mock_table_factory_->CreateMockTable(
+ env_, GenerateFileName(file_number), std::move(contents)));
+
+ VersionEdit edit;
+ edit.AddFile(level, file_number, 0, 10, smallest_key, largest_key,
+ smallest_seqno, largest_seqno, false, oldest_blob_file_number,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName);
+
+ mutex_.Lock();
+ versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(),
+ mutable_cf_options_, &edit, &mutex_);
+ mutex_.Unlock();
+ }
+
+ void SetLastSequence(const SequenceNumber sequence_number) {
+ versions_->SetLastAllocatedSequence(sequence_number + 1);
+ versions_->SetLastPublishedSequence(sequence_number + 1);
+ versions_->SetLastSequence(sequence_number + 1);
+ }
+
+  // Returns the expected result after compaction.
+ stl_wrappers::KVMap CreateTwoFiles(bool gen_corrupted_keys) {
+ auto expected_results = mock::MakeMockFile();
+ const int kKeysPerFile = 10000;
+ const int kCorruptKeysPerFile = 200;
+ const int kMatchingKeys = kKeysPerFile / 2;
+ SequenceNumber sequence_number = 0;
+
+ auto corrupt_id = [&](int id) {
+ return gen_corrupted_keys && id > 0 && id <= kCorruptKeysPerFile;
+ };
+
+ for (int i = 0; i < 2; ++i) {
+ auto contents = mock::MakeMockFile();
+ for (int k = 0; k < kKeysPerFile; ++k) {
+ auto key = ToString(i * kMatchingKeys + k);
+ auto value = ToString(i * kKeysPerFile + k);
+ InternalKey internal_key(key, ++sequence_number, kTypeValue);
+
+        // This is how the key will look once it is written to the bottommost
+        // file
+ InternalKey bottommost_internal_key(
+ key, 0, kTypeValue);
+
+ if (corrupt_id(k)) {
+ test::CorruptKeyType(&internal_key);
+ test::CorruptKeyType(&bottommost_internal_key);
+ }
+ contents.insert({ internal_key.Encode().ToString(), value });
+ if (i == 1 || k < kMatchingKeys || corrupt_id(k - kMatchingKeys)) {
+ expected_results.insert(
+ { bottommost_internal_key.Encode().ToString(), value });
+ }
+ }
+
+ AddMockFile(contents);
+ }
+
+ SetLastSequence(sequence_number);
+
+ return expected_results;
+ }
+
+ void NewDB() {
+ DestroyDB(dbname_, Options());
+ EXPECT_OK(env_->CreateDirIfMissing(dbname_));
+ versions_.reset(new VersionSet(dbname_, &db_options_, env_options_,
+ table_cache_.get(), &write_buffer_manager_,
+ &write_controller_,
+ /*block_cache_tracer=*/nullptr));
+ compaction_job_stats_.Reset();
+ SetIdentityFile(env_, dbname_);
+
+ VersionEdit new_db;
+ if (db_options_.write_dbid_to_manifest) {
+ DBImpl* impl = new DBImpl(DBOptions(), dbname_);
+ std::string db_id;
+ impl->GetDbIdentityFromIdentityFile(&db_id);
+ new_db.SetDBId(db_id);
+ }
+ new_db.SetLogNumber(0);
+ new_db.SetNextFile(2);
+ new_db.SetLastSequence(0);
+
+ const std::string manifest = DescriptorFileName(dbname_, 1);
+ std::unique_ptr<WritableFile> file;
+ Status s = env_->NewWritableFile(
+ manifest, &file, env_->OptimizeForManifestWrite(env_options_));
+ ASSERT_OK(s);
+ std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+ NewLegacyWritableFileWrapper(std::move(file)), manifest, env_options_));
+ {
+ log::Writer log(std::move(file_writer), 0, false);
+ std::string record;
+ new_db.EncodeTo(&record);
+ s = log.AddRecord(record);
+ }
+ ASSERT_OK(s);
+ // Make "CURRENT" file that points to the new manifest file.
+ s = SetCurrentFile(env_, dbname_, 1, nullptr);
+
+ std::vector<ColumnFamilyDescriptor> column_families;
+ cf_options_.table_factory = mock_table_factory_;
+ cf_options_.merge_operator = merge_op_;
+ cf_options_.compaction_filter = compaction_filter_.get();
+ column_families.emplace_back(kDefaultColumnFamilyName, cf_options_);
+
+ EXPECT_OK(versions_->Recover(column_families, false));
+ cfd_ = versions_->GetColumnFamilySet()->GetDefault();
+ }
+
+ void RunCompaction(
+ const std::vector<std::vector<FileMetaData*>>& input_files,
+ const stl_wrappers::KVMap& expected_results,
+ const std::vector<SequenceNumber>& snapshots = {},
+ SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber,
+ int output_level = 1, bool verify = true,
+ uint64_t expected_oldest_blob_file_number = kInvalidBlobFileNumber) {
+ auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+
+ size_t num_input_files = 0;
+ std::vector<CompactionInputFiles> compaction_input_files;
+ for (size_t level = 0; level < input_files.size(); level++) {
+ auto level_files = input_files[level];
+ CompactionInputFiles compaction_level;
+ compaction_level.level = static_cast<int>(level);
+ compaction_level.files.insert(compaction_level.files.end(),
+ level_files.begin(), level_files.end());
+ compaction_input_files.push_back(compaction_level);
+ num_input_files += level_files.size();
+ }
+
+ Compaction compaction(cfd->current()->storage_info(), *cfd->ioptions(),
+ *cfd->GetLatestMutableCFOptions(),
+ compaction_input_files, output_level, 1024 * 1024,
+ 10 * 1024 * 1024, 0, kNoCompression,
+ cfd->ioptions()->compression_opts, 0, {}, true);
+ compaction.SetInputVersion(cfd->current());
+
+ LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get());
+ mutex_.Lock();
+ EventLogger event_logger(db_options_.info_log.get());
+ // TODO(yiwu) add a mock snapshot checker and add test for it.
+ SnapshotChecker* snapshot_checker = nullptr;
+ CompactionJob compaction_job(
+ 0, &compaction, db_options_, env_options_, versions_.get(),
+ &shutting_down_, preserve_deletes_seqnum_, &log_buffer, nullptr,
+ nullptr, nullptr, &mutex_, &error_handler_, snapshots,
+ earliest_write_conflict_snapshot, snapshot_checker, table_cache_,
+ &event_logger, false, false, dbname_, &compaction_job_stats_,
+ Env::Priority::USER);
+ VerifyInitializationOfCompactionJobStats(compaction_job_stats_);
+
+ compaction_job.Prepare();
+ mutex_.Unlock();
+ Status s;
+ s = compaction_job.Run();
+ ASSERT_OK(s);
+ mutex_.Lock();
+ ASSERT_OK(compaction_job.Install(*cfd->GetLatestMutableCFOptions()));
+ mutex_.Unlock();
+
+ if (verify) {
+ ASSERT_GE(compaction_job_stats_.elapsed_micros, 0U);
+ ASSERT_EQ(compaction_job_stats_.num_input_files, num_input_files);
+
+ if (expected_results.empty()) {
+ ASSERT_EQ(compaction_job_stats_.num_output_files, 0U);
+ } else {
+ ASSERT_EQ(compaction_job_stats_.num_output_files, 1U);
+ mock_table_factory_->AssertLatestFile(expected_results);
+
+ auto output_files =
+ cfd->current()->storage_info()->LevelFiles(output_level);
+ ASSERT_EQ(output_files.size(), 1);
+ ASSERT_EQ(output_files[0]->oldest_blob_file_number,
+ expected_oldest_blob_file_number);
+ }
+ }
+ }
+
+ Env* env_;
+ std::shared_ptr<FileSystem> fs_;
+ std::string dbname_;
+ EnvOptions env_options_;
+ ImmutableDBOptions db_options_;
+ ColumnFamilyOptions cf_options_;
+ MutableCFOptions mutable_cf_options_;
+ std::shared_ptr<Cache> table_cache_;
+ WriteController write_controller_;
+ WriteBufferManager write_buffer_manager_;
+ std::unique_ptr<VersionSet> versions_;
+ InstrumentedMutex mutex_;
+ std::atomic<bool> shutting_down_;
+ SequenceNumber preserve_deletes_seqnum_;
+ std::shared_ptr<mock::MockTableFactory> mock_table_factory_;
+ CompactionJobStats compaction_job_stats_;
+ ColumnFamilyData* cfd_;
+ std::unique_ptr<CompactionFilter> compaction_filter_;
+ std::shared_ptr<MergeOperator> merge_op_;
+ ErrorHandler error_handler_;
+};
+
+TEST_F(CompactionJobTest, Simple) {
+ NewDB();
+
+ auto expected_results = CreateTwoFiles(false);
+ auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+ auto files = cfd->current()->storage_info()->LevelFiles(0);
+ ASSERT_EQ(2U, files.size());
+ RunCompaction({ files }, expected_results);
+}
+
+TEST_F(CompactionJobTest, SimpleCorrupted) {
+ NewDB();
+
+ auto expected_results = CreateTwoFiles(true);
+ auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+ auto files = cfd->current()->storage_info()->LevelFiles(0);
+ RunCompaction({files}, expected_results);
+ ASSERT_EQ(compaction_job_stats_.num_corrupt_keys, 400U);
+}
+
+TEST_F(CompactionJobTest, SimpleDeletion) {
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({{KeyStr("c", 4U, kTypeDeletion), ""},
+ {KeyStr("c", 3U, kTypeValue), "val"}});
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({{KeyStr("b", 2U, kTypeValue), "val"},
+ {KeyStr("b", 1U, kTypeValue), "val"}});
+ AddMockFile(file2);
+
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("b", 0U, kTypeValue), "val"}});
+
+ SetLastSequence(4U);
+ auto files = cfd_->current()->storage_info()->LevelFiles(0);
+ RunCompaction({files}, expected_results);
+}
+
+TEST_F(CompactionJobTest, OutputNothing) {
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({{KeyStr("a", 1U, kTypeValue), "val"}});
+
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({{KeyStr("a", 2U, kTypeDeletion), ""}});
+
+ AddMockFile(file2);
+
+ auto expected_results = mock::MakeMockFile();
+
+ SetLastSequence(4U);
+ auto files = cfd_->current()->storage_info()->LevelFiles(0);
+ RunCompaction({files}, expected_results);
+}
+
+TEST_F(CompactionJobTest, SimpleOverwrite) {
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("a", 3U, kTypeValue), "val2"},
+ {KeyStr("b", 4U, kTypeValue), "val3"},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({{KeyStr("a", 1U, kTypeValue), "val"},
+ {KeyStr("b", 2U, kTypeValue), "val"}});
+ AddMockFile(file2);
+
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), "val2"},
+ {KeyStr("b", 0U, kTypeValue), "val3"}});
+
+ SetLastSequence(4U);
+ auto files = cfd_->current()->storage_info()->LevelFiles(0);
+ RunCompaction({files}, expected_results);
+}
+
+TEST_F(CompactionJobTest, SimpleNonLastLevel) {
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("a", 5U, kTypeValue), "val2"},
+ {KeyStr("b", 6U, kTypeValue), "val3"},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({{KeyStr("a", 3U, kTypeValue), "val"},
+ {KeyStr("b", 4U, kTypeValue), "val"}});
+ AddMockFile(file2, 1);
+
+ auto file3 = mock::MakeMockFile({{KeyStr("a", 1U, kTypeValue), "val"},
+ {KeyStr("b", 2U, kTypeValue), "val"}});
+ AddMockFile(file3, 2);
+
+ // Because level 1 is not the last level, the sequence numbers of a and b
+ // cannot be set to 0
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("a", 5U, kTypeValue), "val2"},
+ {KeyStr("b", 6U, kTypeValue), "val3"}});
+
+ SetLastSequence(6U);
+ auto lvl0_files = cfd_->current()->storage_info()->LevelFiles(0);
+ auto lvl1_files = cfd_->current()->storage_info()->LevelFiles(1);
+ RunCompaction({lvl0_files, lvl1_files}, expected_results);
+}
+
+TEST_F(CompactionJobTest, SimpleMerge) {
+ merge_op_ = MergeOperators::CreateStringAppendOperator();
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("a", 5U, kTypeMerge), "5"},
+ {KeyStr("a", 4U, kTypeMerge), "4"},
+ {KeyStr("a", 3U, kTypeValue), "3"},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile(
+ {{KeyStr("b", 2U, kTypeMerge), "2"}, {KeyStr("b", 1U, kTypeValue), "1"}});
+ AddMockFile(file2);
+
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), "3,4,5"},
+ {KeyStr("b", 0U, kTypeValue), "1,2"}});
+
+ SetLastSequence(5U);
+ auto files = cfd_->current()->storage_info()->LevelFiles(0);
+ RunCompaction({files}, expected_results);
+}
+
+TEST_F(CompactionJobTest, NonAssocMerge) {
+ merge_op_ = MergeOperators::CreateStringAppendTESTOperator();
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("a", 5U, kTypeMerge), "5"},
+ {KeyStr("a", 4U, kTypeMerge), "4"},
+ {KeyStr("a", 3U, kTypeMerge), "3"},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile(
+ {{KeyStr("b", 2U, kTypeMerge), "2"}, {KeyStr("b", 1U, kTypeMerge), "1"}});
+ AddMockFile(file2);
+
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), "3,4,5"},
+ {KeyStr("b", 0U, kTypeValue), "1,2"}});
+
+ SetLastSequence(5U);
+ auto files = cfd_->current()->storage_info()->LevelFiles(0);
+ RunCompaction({files}, expected_results);
+}
+
+// Filters merge operands with value 10.
+TEST_F(CompactionJobTest, MergeOperandFilter) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+ compaction_filter_.reset(new test::FilterNumber(10U));
+ NewDB();
+
+ auto file1 = mock::MakeMockFile(
+ {{KeyStr("a", 5U, kTypeMerge), test::EncodeInt(5U)},
+ {KeyStr("a", 4U, kTypeMerge), test::EncodeInt(10U)}, // Filtered
+ {KeyStr("a", 3U, kTypeMerge), test::EncodeInt(3U)}});
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({
+ {KeyStr("b", 2U, kTypeMerge), test::EncodeInt(2U)},
+ {KeyStr("b", 1U, kTypeMerge), test::EncodeInt(10U)} // Filtered
+ });
+ AddMockFile(file2);
+
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), test::EncodeInt(8U)},
+ {KeyStr("b", 0U, kTypeValue), test::EncodeInt(2U)}});
+
+ SetLastSequence(5U);
+ auto files = cfd_->current()->storage_info()->LevelFiles(0);
+ RunCompaction({files}, expected_results);
+}
+
+TEST_F(CompactionJobTest, FilterSomeMergeOperands) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+ compaction_filter_.reset(new test::FilterNumber(10U));
+ NewDB();
+
+ auto file1 = mock::MakeMockFile(
+ {{KeyStr("a", 5U, kTypeMerge), test::EncodeInt(5U)},
+ {KeyStr("a", 4U, kTypeMerge), test::EncodeInt(10U)}, // Filtered
+ {KeyStr("a", 3U, kTypeValue), test::EncodeInt(5U)},
+ {KeyStr("d", 8U, kTypeMerge), test::EncodeInt(10U)}});
+ AddMockFile(file1);
+
+ auto file2 =
+ mock::MakeMockFile({{KeyStr("b", 2U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("b", 1U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("c", 2U, kTypeMerge), test::EncodeInt(3U)},
+ {KeyStr("c", 1U, kTypeValue), test::EncodeInt(7U)},
+ {KeyStr("d", 1U, kTypeValue), test::EncodeInt(6U)}});
+ AddMockFile(file2);
+
+ auto file3 =
+ mock::MakeMockFile({{KeyStr("a", 1U, kTypeMerge), test::EncodeInt(3U)}});
+ AddMockFile(file3, 2);
+
+ auto expected_results = mock::MakeMockFile({
+ {KeyStr("a", 5U, kTypeValue), test::EncodeInt(10U)},
+ {KeyStr("c", 2U, kTypeValue), test::EncodeInt(10U)},
+ {KeyStr("d", 1U, kTypeValue), test::EncodeInt(6U)}
+ // b does not appear because the operands are filtered
+ });
+
+ SetLastSequence(5U);
+ auto files = cfd_->current()->storage_info()->LevelFiles(0);
+ RunCompaction({files}, expected_results);
+}
+
+// Test where all operands/merge results are filtered out.
+TEST_F(CompactionJobTest, FilterAllMergeOperands) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+ compaction_filter_.reset(new test::FilterNumber(10U));
+ NewDB();
+
+ auto file1 =
+ mock::MakeMockFile({{KeyStr("a", 11U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("a", 10U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("a", 9U, kTypeMerge), test::EncodeInt(10U)}});
+ AddMockFile(file1);
+
+ auto file2 =
+ mock::MakeMockFile({{KeyStr("b", 8U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("b", 7U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("b", 6U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("b", 5U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("b", 4U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("b", 3U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("b", 2U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("c", 2U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("c", 1U, kTypeMerge), test::EncodeInt(10U)}});
+ AddMockFile(file2);
+
+ auto file3 =
+ mock::MakeMockFile({{KeyStr("a", 2U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("b", 1U, kTypeMerge), test::EncodeInt(10U)}});
+ AddMockFile(file3, 2);
+
+ SetLastSequence(11U);
+ auto files = cfd_->current()->storage_info()->LevelFiles(0);
+
+ stl_wrappers::KVMap empty_map;
+ RunCompaction({files}, empty_map);
+}
+
+TEST_F(CompactionJobTest, SimpleSingleDelete) {
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("a", 5U, kTypeDeletion), ""},
+ {KeyStr("b", 6U, kTypeSingleDeletion), ""},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({{KeyStr("a", 3U, kTypeValue), "val"},
+ {KeyStr("b", 4U, kTypeValue), "val"}});
+ AddMockFile(file2);
+
+ auto file3 = mock::MakeMockFile({
+ {KeyStr("a", 1U, kTypeValue), "val"},
+ });
+ AddMockFile(file3, 2);
+
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("a", 5U, kTypeDeletion), ""}});
+
+ SetLastSequence(6U);
+ auto files = cfd_->current()->storage_info()->LevelFiles(0);
+ RunCompaction({files}, expected_results);
+}
+
+TEST_F(CompactionJobTest, SingleDeleteSnapshots) {
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("A", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("a", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("b", 21U, kTypeSingleDeletion), ""},
+ {KeyStr("c", 22U, kTypeSingleDeletion), ""},
+ {KeyStr("d", 9U, kTypeSingleDeletion), ""},
+ {KeyStr("f", 21U, kTypeSingleDeletion), ""},
+ {KeyStr("j", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("j", 9U, kTypeSingleDeletion), ""},
+ {KeyStr("k", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("k", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("l", 3U, kTypeSingleDeletion), ""},
+ {KeyStr("l", 2U, kTypeSingleDeletion), ""},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({
+ {KeyStr("0", 2U, kTypeSingleDeletion), ""},
+ {KeyStr("a", 11U, kTypeValue), "val1"},
+ {KeyStr("b", 11U, kTypeValue), "val2"},
+ {KeyStr("c", 21U, kTypeValue), "val3"},
+ {KeyStr("d", 8U, kTypeValue), "val4"},
+ {KeyStr("e", 2U, kTypeSingleDeletion), ""},
+ {KeyStr("f", 1U, kTypeValue), "val1"},
+ {KeyStr("g", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("h", 2U, kTypeSingleDeletion), ""},
+ {KeyStr("m", 12U, kTypeValue), "val1"},
+ {KeyStr("m", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("m", 8U, kTypeValue), "val2"},
+ });
+ AddMockFile(file2);
+
+ auto file3 = mock::MakeMockFile({
+ {KeyStr("A", 1U, kTypeValue), "val"},
+ {KeyStr("e", 1U, kTypeValue), "val"},
+ });
+ AddMockFile(file3, 2);
+
+ auto expected_results = mock::MakeMockFile({
+ {KeyStr("A", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("a", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("a", 11U, kTypeValue), ""},
+ {KeyStr("b", 21U, kTypeSingleDeletion), ""},
+ {KeyStr("b", 11U, kTypeValue), "val2"},
+ {KeyStr("c", 22U, kTypeSingleDeletion), ""},
+ {KeyStr("c", 21U, kTypeValue), ""},
+ {KeyStr("e", 2U, kTypeSingleDeletion), ""},
+ {KeyStr("f", 21U, kTypeSingleDeletion), ""},
+ {KeyStr("f", 1U, kTypeValue), "val1"},
+ {KeyStr("g", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("j", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("k", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("m", 12U, kTypeValue), "val1"},
+ {KeyStr("m", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("m", 8U, kTypeValue), "val2"},
+ });
+
+ SetLastSequence(22U);
+ auto files = cfd_->current()->storage_info()->LevelFiles(0);
+ RunCompaction({files}, expected_results, {10U, 20U}, 10U);
+}
+
+TEST_F(CompactionJobTest, EarliestWriteConflictSnapshot) {
+ NewDB();
+
+ // Test multiple snapshots where the earliest snapshot is not a
+  // write-conflict snapshot.
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("A", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("A", 23U, kTypeValue), "val"},
+ {KeyStr("B", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("B", 23U, kTypeValue), "val"},
+ {KeyStr("D", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("G", 32U, kTypeSingleDeletion), ""},
+ {KeyStr("G", 31U, kTypeValue), "val"},
+ {KeyStr("G", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("G", 23U, kTypeValue), "val2"},
+ {KeyStr("H", 31U, kTypeValue), "val"},
+ {KeyStr("H", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("H", 23U, kTypeValue), "val"},
+ {KeyStr("I", 35U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 34U, kTypeValue), "val2"},
+ {KeyStr("I", 33U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 32U, kTypeValue), "val3"},
+ {KeyStr("I", 31U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 34U, kTypeValue), "val"},
+ {KeyStr("J", 33U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 25U, kTypeValue), "val2"},
+ {KeyStr("J", 24U, kTypeSingleDeletion), ""},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({
+ {KeyStr("A", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("A", 13U, kTypeValue), "val2"},
+ {KeyStr("C", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("C", 13U, kTypeValue), "val"},
+ {KeyStr("E", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("F", 4U, kTypeSingleDeletion), ""},
+ {KeyStr("F", 3U, kTypeValue), "val"},
+ {KeyStr("G", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("G", 13U, kTypeValue), "val3"},
+ {KeyStr("H", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("H", 13U, kTypeValue), "val2"},
+ {KeyStr("I", 13U, kTypeValue), "val4"},
+ {KeyStr("I", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 11U, kTypeValue), "val5"},
+ {KeyStr("J", 15U, kTypeValue), "val3"},
+ {KeyStr("J", 14U, kTypeSingleDeletion), ""},
+ });
+ AddMockFile(file2);
+
+ auto expected_results = mock::MakeMockFile({
+ {KeyStr("A", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("A", 23U, kTypeValue), ""},
+ {KeyStr("B", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("B", 23U, kTypeValue), ""},
+ {KeyStr("D", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("E", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("G", 32U, kTypeSingleDeletion), ""},
+ {KeyStr("G", 31U, kTypeValue), ""},
+ {KeyStr("H", 31U, kTypeValue), "val"},
+ {KeyStr("I", 35U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 34U, kTypeValue), ""},
+ {KeyStr("I", 31U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 13U, kTypeValue), "val4"},
+ {KeyStr("J", 34U, kTypeValue), "val"},
+ {KeyStr("J", 33U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 25U, kTypeValue), "val2"},
+ {KeyStr("J", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 15U, kTypeValue), "val3"},
+ {KeyStr("J", 14U, kTypeSingleDeletion), ""},
+ });
+
+ SetLastSequence(24U);
+ auto files = cfd_->current()->storage_info()->LevelFiles(0);
+ RunCompaction({files}, expected_results, {10U, 20U, 30U}, 20U);
+}
+
+TEST_F(CompactionJobTest, SingleDeleteZeroSeq) {
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("A", 10U, kTypeSingleDeletion), ""},
+ {KeyStr("dummy", 5U, kTypeValue), "val2"},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({
+ {KeyStr("A", 0U, kTypeValue), "val"},
+ });
+ AddMockFile(file2);
+
+ auto expected_results = mock::MakeMockFile({
+ {KeyStr("dummy", 0U, kTypeValue), "val2"},
+ });
+
+ SetLastSequence(22U);
+ auto files = cfd_->current()->storage_info()->LevelFiles(0);
+ RunCompaction({files}, expected_results, {});
+}
+
+TEST_F(CompactionJobTest, MultiSingleDelete) {
+ // Tests three scenarios involving multiple single delete/put pairs:
+ //
+ // A: Put Snapshot SDel Put SDel -> Put Snapshot SDel
+ // B: Snapshot Put SDel Put SDel Snapshot -> Snapshot SDel Snapshot
+ // C: SDel Put SDel Snapshot Put -> Snapshot Put
+ // D: (Put) SDel Snapshot Put SDel -> (Put) SDel Snapshot SDel
+ // E: Put SDel Snapshot Put SDel -> Snapshot SDel
+  // F: Put SDel Put SDel Snapshot -> Removed
+  // G: Snapshot SDel Put SDel Put -> Snapshot Put SDel
+  // H: (Put) Put SDel Put SDel Snapshot -> Removed
+ // I: (Put) Snapshot Put SDel Put SDel -> SDel
+ // J: Put Put SDel Put SDel SDel Snapshot Put Put SDel SDel Put
+ // -> Snapshot Put
+ // K: SDel SDel Put SDel Put Put Snapshot SDel Put SDel SDel Put SDel
+ // -> Snapshot Put Snapshot SDel
+ // L: SDel Put Del Put SDel Snapshot Del Put Del SDel Put SDel
+ // -> Snapshot SDel
+ // M: (Put) SDel Put Del Put SDel Snapshot Put Del SDel Put SDel Del
+ // -> SDel Snapshot Del
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("A", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("A", 13U, kTypeValue), "val5"},
+ {KeyStr("A", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("B", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("B", 13U, kTypeValue), "val2"},
+ {KeyStr("C", 14U, kTypeValue), "val3"},
+ {KeyStr("D", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("D", 11U, kTypeValue), "val4"},
+ {KeyStr("G", 15U, kTypeValue), "val"},
+ {KeyStr("G", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("G", 13U, kTypeValue), "val"},
+ {KeyStr("I", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 13U, kTypeValue), "val"},
+ {KeyStr("J", 15U, kTypeValue), "val"},
+ {KeyStr("J", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 13U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 12U, kTypeValue), "val"},
+ {KeyStr("J", 11U, kTypeValue), "val"},
+ {KeyStr("K", 16U, kTypeSingleDeletion), ""},
+ {KeyStr("K", 15U, kTypeValue), "val1"},
+ {KeyStr("K", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("K", 13U, kTypeSingleDeletion), ""},
+ {KeyStr("K", 12U, kTypeValue), "val2"},
+ {KeyStr("K", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("L", 16U, kTypeSingleDeletion), ""},
+ {KeyStr("L", 15U, kTypeValue), "val"},
+ {KeyStr("L", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("L", 13U, kTypeDeletion), ""},
+ {KeyStr("L", 12U, kTypeValue), "val"},
+ {KeyStr("L", 11U, kTypeDeletion), ""},
+ {KeyStr("M", 16U, kTypeDeletion), ""},
+ {KeyStr("M", 15U, kTypeSingleDeletion), ""},
+ {KeyStr("M", 14U, kTypeValue), "val"},
+ {KeyStr("M", 13U, kTypeSingleDeletion), ""},
+ {KeyStr("M", 12U, kTypeDeletion), ""},
+ {KeyStr("M", 11U, kTypeValue), "val"},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({
+ {KeyStr("A", 10U, kTypeValue), "val"},
+ {KeyStr("B", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("B", 11U, kTypeValue), "val2"},
+ {KeyStr("C", 10U, kTypeSingleDeletion), ""},
+ {KeyStr("C", 9U, kTypeValue), "val6"},
+ {KeyStr("C", 8U, kTypeSingleDeletion), ""},
+ {KeyStr("D", 10U, kTypeSingleDeletion), ""},
+ {KeyStr("E", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("E", 11U, kTypeValue), "val"},
+ {KeyStr("E", 5U, kTypeSingleDeletion), ""},
+ {KeyStr("E", 4U, kTypeValue), "val"},
+ {KeyStr("F", 6U, kTypeSingleDeletion), ""},
+ {KeyStr("F", 5U, kTypeValue), "val"},
+ {KeyStr("F", 4U, kTypeSingleDeletion), ""},
+ {KeyStr("F", 3U, kTypeValue), "val"},
+ {KeyStr("G", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("H", 6U, kTypeSingleDeletion), ""},
+ {KeyStr("H", 5U, kTypeValue), "val"},
+ {KeyStr("H", 4U, kTypeSingleDeletion), ""},
+ {KeyStr("H", 3U, kTypeValue), "val"},
+ {KeyStr("I", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 11U, kTypeValue), "val"},
+ {KeyStr("J", 6U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 5U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 4U, kTypeValue), "val"},
+ {KeyStr("J", 3U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 2U, kTypeValue), "val"},
+ {KeyStr("K", 8U, kTypeValue), "val3"},
+ {KeyStr("K", 7U, kTypeValue), "val4"},
+ {KeyStr("K", 6U, kTypeSingleDeletion), ""},
+ {KeyStr("K", 5U, kTypeValue), "val5"},
+ {KeyStr("K", 2U, kTypeSingleDeletion), ""},
+ {KeyStr("K", 1U, kTypeSingleDeletion), ""},
+ {KeyStr("L", 5U, kTypeSingleDeletion), ""},
+ {KeyStr("L", 4U, kTypeValue), "val"},
+ {KeyStr("L", 3U, kTypeDeletion), ""},
+ {KeyStr("L", 2U, kTypeValue), "val"},
+ {KeyStr("L", 1U, kTypeSingleDeletion), ""},
+ {KeyStr("M", 10U, kTypeSingleDeletion), ""},
+ {KeyStr("M", 7U, kTypeValue), "val"},
+ {KeyStr("M", 5U, kTypeDeletion), ""},
+ {KeyStr("M", 4U, kTypeValue), "val"},
+ {KeyStr("M", 3U, kTypeSingleDeletion), ""},
+ });
+ AddMockFile(file2);
+
+ auto file3 = mock::MakeMockFile({
+ {KeyStr("D", 1U, kTypeValue), "val"},
+ {KeyStr("H", 1U, kTypeValue), "val"},
+ {KeyStr("I", 2U, kTypeValue), "val"},
+ });
+ AddMockFile(file3, 2);
+
+ auto file4 = mock::MakeMockFile({
+ {KeyStr("M", 1U, kTypeValue), "val"},
+ });
+ AddMockFile(file4, 2);
+
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("A", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("A", 13U, kTypeValue), ""},
+ {KeyStr("A", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("A", 10U, kTypeValue), "val"},
+ {KeyStr("B", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("B", 13U, kTypeValue), ""},
+ {KeyStr("C", 14U, kTypeValue), "val3"},
+ {KeyStr("D", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("D", 11U, kTypeValue), ""},
+ {KeyStr("D", 10U, kTypeSingleDeletion), ""},
+ {KeyStr("E", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("E", 11U, kTypeValue), ""},
+ {KeyStr("G", 15U, kTypeValue), "val"},
+ {KeyStr("G", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 13U, kTypeValue), ""},
+ {KeyStr("J", 15U, kTypeValue), "val"},
+ {KeyStr("K", 16U, kTypeSingleDeletion), ""},
+ {KeyStr("K", 15U, kTypeValue), ""},
+ {KeyStr("K", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("K", 8U, kTypeValue), "val3"},
+ {KeyStr("L", 16U, kTypeSingleDeletion), ""},
+ {KeyStr("L", 15U, kTypeValue), ""},
+ {KeyStr("M", 16U, kTypeDeletion), ""},
+ {KeyStr("M", 3U, kTypeSingleDeletion), ""}});
+
+ SetLastSequence(22U);
+ auto files = cfd_->current()->storage_info()->LevelFiles(0);
+ RunCompaction({files}, expected_results, {10U}, 10U);
+}
+
+// This test documents the behavior where a corrupt key follows a deletion or a
+// single deletion and the (single) deletion gets removed while the corrupt key
+// gets written out. TODO(noetzli): We probably want a better way to treat
+// corrupt keys.
+TEST_F(CompactionJobTest, CorruptionAfterDeletion) {
+ NewDB();
+
+ auto file1 =
+ mock::MakeMockFile({{test::KeyStr("A", 6U, kTypeValue), "val3"},
+ {test::KeyStr("a", 5U, kTypeDeletion), ""},
+ {test::KeyStr("a", 4U, kTypeValue, true), "val"}});
+ AddMockFile(file1);
+
+ auto file2 =
+ mock::MakeMockFile({{test::KeyStr("b", 3U, kTypeSingleDeletion), ""},
+ {test::KeyStr("b", 2U, kTypeValue, true), "val"},
+ {test::KeyStr("c", 1U, kTypeValue), "val2"}});
+ AddMockFile(file2);
+
+ auto expected_results =
+ mock::MakeMockFile({{test::KeyStr("A", 0U, kTypeValue), "val3"},
+ {test::KeyStr("a", 0U, kTypeValue, true), "val"},
+ {test::KeyStr("b", 0U, kTypeValue, true), "val"},
+ {test::KeyStr("c", 0U, kTypeValue), "val2"}});
+
+ SetLastSequence(6U);
+ auto files = cfd_->current()->storage_info()->LevelFiles(0);
+ RunCompaction({files}, expected_results);
+}
+
+TEST_F(CompactionJobTest, OldestBlobFileNumber) {
+ NewDB();
+
+ // Note: blob1 is inlined TTL, so it will not be considered for the purposes
+ // of identifying the oldest referenced blob file. Similarly, blob6 will be
+ // ignored because it has TTL and hence refers to a TTL blob file.
+ const stl_wrappers::KVMap::value_type blob1(
+ KeyStr("a", 1U, kTypeBlobIndex), BlobStrInlinedTTL("foo", 1234567890ULL));
+ const stl_wrappers::KVMap::value_type blob2(KeyStr("b", 2U, kTypeBlobIndex),
+ BlobStr(59, 123456, 999));
+ const stl_wrappers::KVMap::value_type blob3(KeyStr("c", 3U, kTypeBlobIndex),
+ BlobStr(138, 1000, 1 << 8));
+ auto file1 = mock::MakeMockFile({blob1, blob2, blob3});
+ AddMockFile(file1);
+
+ const stl_wrappers::KVMap::value_type blob4(KeyStr("d", 4U, kTypeBlobIndex),
+ BlobStr(199, 3 << 10, 1 << 20));
+ const stl_wrappers::KVMap::value_type blob5(KeyStr("e", 5U, kTypeBlobIndex),
+ BlobStr(19, 6789, 333));
+ const stl_wrappers::KVMap::value_type blob6(
+ KeyStr("f", 6U, kTypeBlobIndex),
+ BlobStrTTL(5, 2048, 1 << 7, 1234567890ULL));
+ auto file2 = mock::MakeMockFile({blob4, blob5, blob6});
+ AddMockFile(file2);
+
+ const stl_wrappers::KVMap::value_type expected_blob1(
+ KeyStr("a", 0U, kTypeBlobIndex), blob1.second);
+ const stl_wrappers::KVMap::value_type expected_blob2(
+ KeyStr("b", 0U, kTypeBlobIndex), blob2.second);
+ const stl_wrappers::KVMap::value_type expected_blob3(
+ KeyStr("c", 0U, kTypeBlobIndex), blob3.second);
+ const stl_wrappers::KVMap::value_type expected_blob4(
+ KeyStr("d", 0U, kTypeBlobIndex), blob4.second);
+ const stl_wrappers::KVMap::value_type expected_blob5(
+ KeyStr("e", 0U, kTypeBlobIndex), blob5.second);
+ const stl_wrappers::KVMap::value_type expected_blob6(
+ KeyStr("f", 0U, kTypeBlobIndex), blob6.second);
+ auto expected_results =
+ mock::MakeMockFile({expected_blob1, expected_blob2, expected_blob3,
+ expected_blob4, expected_blob5, expected_blob6});
+
+ SetLastSequence(6U);
+ auto files = cfd_->current()->storage_info()->LevelFiles(0);
+ RunCompaction({files}, expected_results, std::vector<SequenceNumber>(),
+ kMaxSequenceNumber, /* output_level */ 1, /* verify */ true,
+ /* expected_oldest_blob_file_number */ 19);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr,
+ "SKIPPED as CompactionJobStats is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/db/compaction/compaction_picker.cc b/src/rocksdb/db/compaction/compaction_picker.cc
new file mode 100644
index 000000000..4355d4b91
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker.cc
@@ -0,0 +1,1131 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction_picker.h"
+
+#include <cinttypes>
+#include <limits>
+#include <queue>
+#include <string>
+#include <utility>
+#include <vector>
+#include "db/column_family.h"
+#include "file/filename.h"
+#include "logging/log_buffer.h"
+#include "monitoring/statistics.h"
+#include "test_util/sync_point.h"
+#include "util/random.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+uint64_t TotalCompensatedFileSize(const std::vector<FileMetaData*>& files) {
+ uint64_t sum = 0;
+ for (size_t i = 0; i < files.size() && files[i]; i++) {
+ sum += files[i]->compensated_file_size;
+ }
+ return sum;
+}
+} // anonymous namespace
+
+bool FindIntraL0Compaction(const std::vector<FileMetaData*>& level_files,
+ size_t min_files_to_compact,
+ uint64_t max_compact_bytes_per_del_file,
+ uint64_t max_compaction_bytes,
+ CompactionInputFiles* comp_inputs,
+ SequenceNumber earliest_mem_seqno) {
+  // Do not pick an ingested file while there is at least one unflushed
+  // memtable whose seqno range overlaps with the sst.
+ TEST_SYNC_POINT("FindIntraL0Compaction");
+ size_t start = 0;
+ for (; start < level_files.size(); start++) {
+ if (level_files[start]->being_compacted) {
+ return false;
+ }
+    // If there is no data in the memtable, the earliest sequence number would
+    // be the largest sequence number in the last memtable.
+    // Because all files are sorted in descending order by largest_seqno, we
+    // only need to check the first one.
+ if (level_files[start]->fd.largest_seqno <= earliest_mem_seqno) {
+ break;
+ }
+ }
+ if (start >= level_files.size()) {
+ return false;
+ }
+ size_t compact_bytes = static_cast<size_t>(level_files[start]->fd.file_size);
+ uint64_t compensated_compact_bytes =
+ level_files[start]->compensated_file_size;
+ size_t compact_bytes_per_del_file = port::kMaxSizet;
+ // Compaction range will be [start, limit).
+ size_t limit;
+ // Pull in files until the amount of compaction work per deleted file begins
+ // increasing or maximum total compaction size is reached.
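+  // Illustrative walk-through with hypothetical file sizes {10, 4, 2, 100}
+  // and start == 0, assuming no file is being compacted and the compensated
+  // size limit is not hit:
+  //   limit == 1: compact_bytes == 14, bytes per deleted file == 14
+  //   limit == 2: compact_bytes == 16, bytes per deleted file == 8
+  //   limit == 3: compact_bytes == 116, bytes per deleted file == 38 > 8,
+  //   so the loop stops and the candidate range is [0, 3).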
+ size_t new_compact_bytes_per_del_file = 0;
+ for (limit = start + 1; limit < level_files.size(); ++limit) {
+ compact_bytes += static_cast<size_t>(level_files[limit]->fd.file_size);
+ compensated_compact_bytes += level_files[limit]->compensated_file_size;
+ new_compact_bytes_per_del_file = compact_bytes / (limit - start);
+ if (level_files[limit]->being_compacted ||
+ new_compact_bytes_per_del_file > compact_bytes_per_del_file ||
+ compensated_compact_bytes > max_compaction_bytes) {
+ break;
+ }
+ compact_bytes_per_del_file = new_compact_bytes_per_del_file;
+ }
+
+ if ((limit - start) >= min_files_to_compact &&
+ compact_bytes_per_del_file < max_compact_bytes_per_del_file) {
+ assert(comp_inputs != nullptr);
+ comp_inputs->level = 0;
+ for (size_t i = start; i < limit; ++i) {
+ comp_inputs->files.push_back(level_files[i]);
+ }
+ return true;
+ }
+ return false;
+}
+
+// Determine compression type, based on user options, level of the output
+// file and whether compression is disabled.
+// If enable_compression is false, then compression is always disabled no
+// matter what the values of the other two parameters are.
+// Otherwise, the compression type is determined based on options and level.
+CompressionType GetCompressionType(const ImmutableCFOptions& ioptions,
+ const VersionStorageInfo* vstorage,
+ const MutableCFOptions& mutable_cf_options,
+ int level, int base_level,
+ const bool enable_compression) {
+ if (!enable_compression) {
+ // disable compression
+ return kNoCompression;
+ }
+
+ // If bottommost_compression is set and we are compacting to the
+ // bottommost level then we should use it.
+ if (ioptions.bottommost_compression != kDisableCompressionOption &&
+ level >= (vstorage->num_non_empty_levels() - 1)) {
+ return ioptions.bottommost_compression;
+ }
+ // If the user has specified a different compression level for each level,
+ // then pick the compression for that level.
+ if (!ioptions.compression_per_level.empty()) {
+ assert(level == 0 || level >= base_level);
+ int idx = (level == 0) ? 0 : level - base_level + 1;
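+    // Worked example with hypothetical values: with base_level == 2 and a
+    // compression_per_level list of size 4, level 0 maps to idx 0, level 2
+    // to idx 1, level 3 to idx 2, and level 5 would give idx 4, which the
+    // clamp below reduces to the last entry (idx 3).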
+
+ const int n = static_cast<int>(ioptions.compression_per_level.size()) - 1;
+    // It is possible for level to be -1; in that case, we use level
+ // 0's compression. This occurs mostly in backwards compatibility
+ // situations when the builder doesn't know what level the file
+ // belongs to. Likewise, if level is beyond the end of the
+ // specified compression levels, use the last value.
+ return ioptions.compression_per_level[std::max(0, std::min(idx, n))];
+ } else {
+ return mutable_cf_options.compression;
+ }
+}
+
+CompressionOptions GetCompressionOptions(const ImmutableCFOptions& ioptions,
+ const VersionStorageInfo* vstorage,
+ int level,
+ const bool enable_compression) {
+ if (!enable_compression) {
+ return ioptions.compression_opts;
+ }
+ // If bottommost_compression is set and we are compacting to the
+ // bottommost level then we should use the specified compression options
+  // for bottommost_compression.
+ if (ioptions.bottommost_compression != kDisableCompressionOption &&
+ level >= (vstorage->num_non_empty_levels() - 1) &&
+ ioptions.bottommost_compression_opts.enabled) {
+ return ioptions.bottommost_compression_opts;
+ }
+ return ioptions.compression_opts;
+}
+
+CompactionPicker::CompactionPicker(const ImmutableCFOptions& ioptions,
+ const InternalKeyComparator* icmp)
+ : ioptions_(ioptions), icmp_(icmp) {}
+
+CompactionPicker::~CompactionPicker() {}
+
+// Delete this compaction from the list of running compactions.
+void CompactionPicker::ReleaseCompactionFiles(Compaction* c, Status status) {
+ UnregisterCompaction(c);
+ if (!status.ok()) {
+ c->ResetNextCompactionIndex();
+ }
+}
+
+void CompactionPicker::GetRange(const CompactionInputFiles& inputs,
+ InternalKey* smallest,
+ InternalKey* largest) const {
+ const int level = inputs.level;
+ assert(!inputs.empty());
+ smallest->Clear();
+ largest->Clear();
+
+ if (level == 0) {
+ for (size_t i = 0; i < inputs.size(); i++) {
+ FileMetaData* f = inputs[i];
+ if (i == 0) {
+ *smallest = f->smallest;
+ *largest = f->largest;
+ } else {
+ if (icmp_->Compare(f->smallest, *smallest) < 0) {
+ *smallest = f->smallest;
+ }
+ if (icmp_->Compare(f->largest, *largest) > 0) {
+ *largest = f->largest;
+ }
+ }
+ }
+ } else {
+ *smallest = inputs[0]->smallest;
+ *largest = inputs[inputs.size() - 1]->largest;
+ }
+}
+
+void CompactionPicker::GetRange(const CompactionInputFiles& inputs1,
+ const CompactionInputFiles& inputs2,
+ InternalKey* smallest,
+ InternalKey* largest) const {
+ assert(!inputs1.empty() || !inputs2.empty());
+ if (inputs1.empty()) {
+ GetRange(inputs2, smallest, largest);
+ } else if (inputs2.empty()) {
+ GetRange(inputs1, smallest, largest);
+ } else {
+ InternalKey smallest1, smallest2, largest1, largest2;
+ GetRange(inputs1, &smallest1, &largest1);
+ GetRange(inputs2, &smallest2, &largest2);
+ *smallest =
+ icmp_->Compare(smallest1, smallest2) < 0 ? smallest1 : smallest2;
+ *largest = icmp_->Compare(largest1, largest2) < 0 ? largest2 : largest1;
+ }
+}
+
+void CompactionPicker::GetRange(const std::vector<CompactionInputFiles>& inputs,
+ InternalKey* smallest,
+ InternalKey* largest) const {
+ InternalKey current_smallest;
+ InternalKey current_largest;
+ bool initialized = false;
+ for (const auto& in : inputs) {
+ if (in.empty()) {
+ continue;
+ }
+ GetRange(in, &current_smallest, &current_largest);
+ if (!initialized) {
+ *smallest = current_smallest;
+ *largest = current_largest;
+ initialized = true;
+ } else {
+ if (icmp_->Compare(current_smallest, *smallest) < 0) {
+ *smallest = current_smallest;
+ }
+ if (icmp_->Compare(current_largest, *largest) > 0) {
+ *largest = current_largest;
+ }
+ }
+ }
+ assert(initialized);
+}
+
+bool CompactionPicker::ExpandInputsToCleanCut(const std::string& /*cf_name*/,
+ VersionStorageInfo* vstorage,
+ CompactionInputFiles* inputs,
+ InternalKey** next_smallest) {
+  // An empty input set is not a valid compaction
+ assert(!inputs->empty());
+
+ const int level = inputs->level;
+ // GetOverlappingInputs will always do the right thing for level-0.
+ // So we don't need to do any expansion if level == 0.
+ if (level == 0) {
+ return true;
+ }
+
+ InternalKey smallest, largest;
+
+ // Keep expanding inputs until we are sure that there is a "clean cut"
+ // boundary between the files in input and the surrounding files.
+ // This will ensure that no parts of a key are lost during compaction.
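+  // For example, if the last picked file ends at user key "k" and its
+  // neighbor in this level begins with another entry for "k", the neighbor is
+  // pulled in as well so that every version of "k" moves together
+  // (illustrative example; keys are hypothetical).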
+ int hint_index = -1;
+ size_t old_size;
+ do {
+ old_size = inputs->size();
+ GetRange(*inputs, &smallest, &largest);
+ inputs->clear();
+ vstorage->GetOverlappingInputs(level, &smallest, &largest, &inputs->files,
+ hint_index, &hint_index, true,
+ next_smallest);
+ } while (inputs->size() > old_size);
+
+ // we started off with inputs non-empty and the previous loop only grew
+ // inputs. thus, inputs should be non-empty here
+ assert(!inputs->empty());
+
+ // If, after the expansion, there are files that are already under
+ // compaction, then we must drop/cancel this compaction.
+ if (AreFilesInCompaction(inputs->files)) {
+ return false;
+ }
+ return true;
+}
+
+bool CompactionPicker::RangeOverlapWithCompaction(
+ const Slice& smallest_user_key, const Slice& largest_user_key,
+ int level) const {
+ const Comparator* ucmp = icmp_->user_comparator();
+ for (Compaction* c : compactions_in_progress_) {
+ if (c->output_level() == level &&
+ ucmp->Compare(smallest_user_key, c->GetLargestUserKey()) <= 0 &&
+ ucmp->Compare(largest_user_key, c->GetSmallestUserKey()) >= 0) {
+ // Overlap
+ return true;
+ }
+ }
+ // Did not overlap with any running compaction in level `level`
+ return false;
+}
+
+bool CompactionPicker::FilesRangeOverlapWithCompaction(
+ const std::vector<CompactionInputFiles>& inputs, int level) const {
+ bool is_empty = true;
+ for (auto& in : inputs) {
+ if (!in.empty()) {
+ is_empty = false;
+ break;
+ }
+ }
+ if (is_empty) {
+ // No files in inputs
+ return false;
+ }
+
+ InternalKey smallest, largest;
+ GetRange(inputs, &smallest, &largest);
+ return RangeOverlapWithCompaction(smallest.user_key(), largest.user_key(),
+ level);
+}
+
+// Returns true if any one of the specified files is being compacted
+bool CompactionPicker::AreFilesInCompaction(
+ const std::vector<FileMetaData*>& files) {
+ for (size_t i = 0; i < files.size(); i++) {
+ if (files[i]->being_compacted) {
+ return true;
+ }
+ }
+ return false;
+}
+
+Compaction* CompactionPicker::CompactFiles(
+ const CompactionOptions& compact_options,
+ const std::vector<CompactionInputFiles>& input_files, int output_level,
+ VersionStorageInfo* vstorage, const MutableCFOptions& mutable_cf_options,
+ uint32_t output_path_id) {
+ assert(input_files.size());
+ // This compaction output should not overlap with a running compaction as
+ // `SanitizeCompactionInputFiles` should've checked earlier and db mutex
+ // shouldn't have been released since.
+ assert(!FilesRangeOverlapWithCompaction(input_files, output_level));
+
+ CompressionType compression_type;
+ if (compact_options.compression == kDisableCompressionOption) {
+ int base_level;
+ if (ioptions_.compaction_style == kCompactionStyleLevel) {
+ base_level = vstorage->base_level();
+ } else {
+ base_level = 1;
+ }
+ compression_type =
+ GetCompressionType(ioptions_, vstorage, mutable_cf_options,
+ output_level, base_level);
+ } else {
+ // TODO(ajkr): `CompactionOptions` offers configurable `CompressionType`
+ // without configurable `CompressionOptions`, which is inconsistent.
+ compression_type = compact_options.compression;
+ }
+ auto c = new Compaction(
+ vstorage, ioptions_, mutable_cf_options, input_files, output_level,
+ compact_options.output_file_size_limit,
+ mutable_cf_options.max_compaction_bytes, output_path_id, compression_type,
+ GetCompressionOptions(ioptions_, vstorage, output_level),
+ compact_options.max_subcompactions,
+ /* grandparents */ {}, true);
+ RegisterCompaction(c);
+ return c;
+}
+
+Status CompactionPicker::GetCompactionInputsFromFileNumbers(
+ std::vector<CompactionInputFiles>* input_files,
+ std::unordered_set<uint64_t>* input_set, const VersionStorageInfo* vstorage,
+ const CompactionOptions& /*compact_options*/) const {
+ if (input_set->size() == 0U) {
+ return Status::InvalidArgument(
+ "Compaction must include at least one file.");
+ }
+ assert(input_files);
+
+ std::vector<CompactionInputFiles> matched_input_files;
+ matched_input_files.resize(vstorage->num_levels());
+ int first_non_empty_level = -1;
+ int last_non_empty_level = -1;
+ // TODO(yhchiang): use a lazy-initialized mapping from
+ // file_number to FileMetaData in Version.
+ for (int level = 0; level < vstorage->num_levels(); ++level) {
+ for (auto file : vstorage->LevelFiles(level)) {
+ auto iter = input_set->find(file->fd.GetNumber());
+ if (iter != input_set->end()) {
+ matched_input_files[level].files.push_back(file);
+ input_set->erase(iter);
+ last_non_empty_level = level;
+ if (first_non_empty_level == -1) {
+ first_non_empty_level = level;
+ }
+ }
+ }
+ }
+
+ if (!input_set->empty()) {
+ std::string message(
+ "Cannot find matched SST files for the following file numbers:");
+ for (auto fn : *input_set) {
+ message += " ";
+ message += ToString(fn);
+ }
+ return Status::InvalidArgument(message);
+ }
+
+ for (int level = first_non_empty_level; level <= last_non_empty_level;
+ ++level) {
+ matched_input_files[level].level = level;
+ input_files->emplace_back(std::move(matched_input_files[level]));
+ }
+
+ return Status::OK();
+}
+
+// Returns true if any one of the parent files is being compacted
+bool CompactionPicker::IsRangeInCompaction(VersionStorageInfo* vstorage,
+ const InternalKey* smallest,
+ const InternalKey* largest,
+ int level, int* level_index) {
+ std::vector<FileMetaData*> inputs;
+ assert(level < NumberLevels());
+
+ vstorage->GetOverlappingInputs(level, smallest, largest, &inputs,
+ level_index ? *level_index : 0, level_index);
+ return AreFilesInCompaction(inputs);
+}
+
+// Populates the set of inputs of all other levels that overlap with the
+// start level.
+// For now, we assume all levels except the start level and the output level
+// are empty.
+// Will also attempt to expand the start level if that doesn't expand the
+// output level or cause the start level to include a file for compaction that
+// has an overlapping user-key with another file.
+// REQUIRES: input_level and output_level are different
+// REQUIRES: inputs->empty() == false
+// Returns false if files on parent level are currently in compaction, which
+// means that we can't compact them
+bool CompactionPicker::SetupOtherInputs(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* vstorage, CompactionInputFiles* inputs,
+ CompactionInputFiles* output_level_inputs, int* parent_index,
+ int base_index) {
+ assert(!inputs->empty());
+ assert(output_level_inputs->empty());
+ const int input_level = inputs->level;
+ const int output_level = output_level_inputs->level;
+ if (input_level == output_level) {
+ // no possibility of conflict
+ return true;
+ }
+
+ // For now, we only support merging two levels, start level and output level.
+ // We need to assert other levels are empty.
+ for (int l = input_level + 1; l < output_level; l++) {
+ assert(vstorage->NumLevelFiles(l) == 0);
+ }
+
+ InternalKey smallest, largest;
+
+ // Get the range one last time.
+ GetRange(*inputs, &smallest, &largest);
+
+  // Populate the set of next-level files (output_level_inputs) to include in
+  // the compaction
+ vstorage->GetOverlappingInputs(output_level, &smallest, &largest,
+ &output_level_inputs->files, *parent_index,
+ parent_index);
+ if (AreFilesInCompaction(output_level_inputs->files)) {
+ return false;
+ }
+ if (!output_level_inputs->empty()) {
+ if (!ExpandInputsToCleanCut(cf_name, vstorage, output_level_inputs)) {
+ return false;
+ }
+ }
+
+ // See if we can further grow the number of inputs in "level" without
+ // changing the number of "level+1" files we pick up. We also choose NOT
+ // to expand if this would cause "level" to include some entries for some
+ // user key, while excluding other entries for the same user key. This
+ // can happen when one user key spans multiple files.
+ if (!output_level_inputs->empty()) {
+ const uint64_t limit = mutable_cf_options.max_compaction_bytes;
+ const uint64_t output_level_inputs_size =
+ TotalCompensatedFileSize(output_level_inputs->files);
+ const uint64_t inputs_size = TotalCompensatedFileSize(inputs->files);
+ bool expand_inputs = false;
+
+ CompactionInputFiles expanded_inputs;
+ expanded_inputs.level = input_level;
+    // Get the closed key interval covering both the input and output level
+    // files
+ InternalKey all_start, all_limit;
+ GetRange(*inputs, *output_level_inputs, &all_start, &all_limit);
+ bool try_overlapping_inputs = true;
+ vstorage->GetOverlappingInputs(input_level, &all_start, &all_limit,
+ &expanded_inputs.files, base_index, nullptr);
+ uint64_t expanded_inputs_size =
+ TotalCompensatedFileSize(expanded_inputs.files);
+ if (!ExpandInputsToCleanCut(cf_name, vstorage, &expanded_inputs)) {
+ try_overlapping_inputs = false;
+ }
+ if (try_overlapping_inputs && expanded_inputs.size() > inputs->size() &&
+ output_level_inputs_size + expanded_inputs_size < limit &&
+ !AreFilesInCompaction(expanded_inputs.files)) {
+ InternalKey new_start, new_limit;
+ GetRange(expanded_inputs, &new_start, &new_limit);
+ CompactionInputFiles expanded_output_level_inputs;
+ expanded_output_level_inputs.level = output_level;
+ vstorage->GetOverlappingInputs(output_level, &new_start, &new_limit,
+ &expanded_output_level_inputs.files,
+ *parent_index, parent_index);
+ assert(!expanded_output_level_inputs.empty());
+ if (!AreFilesInCompaction(expanded_output_level_inputs.files) &&
+ ExpandInputsToCleanCut(cf_name, vstorage,
+ &expanded_output_level_inputs) &&
+ expanded_output_level_inputs.size() == output_level_inputs->size()) {
+ expand_inputs = true;
+ }
+ }
+ if (!expand_inputs) {
+ vstorage->GetCleanInputsWithinInterval(input_level, &all_start,
+ &all_limit, &expanded_inputs.files,
+ base_index, nullptr);
+ expanded_inputs_size = TotalCompensatedFileSize(expanded_inputs.files);
+ if (expanded_inputs.size() > inputs->size() &&
+ output_level_inputs_size + expanded_inputs_size < limit &&
+ !AreFilesInCompaction(expanded_inputs.files)) {
+ expand_inputs = true;
+ }
+ }
+ if (expand_inputs) {
+ ROCKS_LOG_INFO(ioptions_.info_log,
+ "[%s] Expanding@%d %" ROCKSDB_PRIszt "+%" ROCKSDB_PRIszt
+ "(%" PRIu64 "+%" PRIu64 " bytes) to %" ROCKSDB_PRIszt
+ "+%" ROCKSDB_PRIszt " (%" PRIu64 "+%" PRIu64 " bytes)\n",
+ cf_name.c_str(), input_level, inputs->size(),
+ output_level_inputs->size(), inputs_size,
+ output_level_inputs_size, expanded_inputs.size(),
+ output_level_inputs->size(), expanded_inputs_size,
+ output_level_inputs_size);
+ inputs->files = expanded_inputs.files;
+ }
+ }
+ return true;
+}
+
+void CompactionPicker::GetGrandparents(
+ VersionStorageInfo* vstorage, const CompactionInputFiles& inputs,
+ const CompactionInputFiles& output_level_inputs,
+ std::vector<FileMetaData*>* grandparents) {
+ InternalKey start, limit;
+ GetRange(inputs, output_level_inputs, &start, &limit);
+ // Compute the set of grandparent files that overlap this compaction
+ // (parent == level+1; grandparent == level+2)
+ if (output_level_inputs.level + 1 < NumberLevels()) {
+ vstorage->GetOverlappingInputs(output_level_inputs.level + 1, &start,
+ &limit, grandparents);
+ }
+}
+
+Compaction* CompactionPicker::CompactRange(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* vstorage, int input_level, int output_level,
+ const CompactRangeOptions& compact_range_options, const InternalKey* begin,
+ const InternalKey* end, InternalKey** compaction_end, bool* manual_conflict,
+ uint64_t max_file_num_to_ignore) {
+ // CompactionPickerFIFO has its own implementation of compact range
+ assert(ioptions_.compaction_style != kCompactionStyleFIFO);
+
+ if (input_level == ColumnFamilyData::kCompactAllLevels) {
+ assert(ioptions_.compaction_style == kCompactionStyleUniversal);
+
+ // Universal compaction with more than one level always compacts all the
+ // files together to the last level.
+ assert(vstorage->num_levels() > 1);
+    // DBImpl::CompactRange() sets the output level to the last level
+ if (ioptions_.allow_ingest_behind) {
+ assert(output_level == vstorage->num_levels() - 2);
+ } else {
+ assert(output_level == vstorage->num_levels() - 1);
+ }
+    // DBImpl::RunManualCompaction will use the full key range for universal
+    // compaction
+ assert(begin == nullptr);
+ assert(end == nullptr);
+ *compaction_end = nullptr;
+
+ int start_level = 0;
+ for (; start_level < vstorage->num_levels() &&
+ vstorage->NumLevelFiles(start_level) == 0;
+ start_level++) {
+ }
+ if (start_level == vstorage->num_levels()) {
+ return nullptr;
+ }
+
+ if ((start_level == 0) && (!level0_compactions_in_progress_.empty())) {
+ *manual_conflict = true;
+ // Only one level 0 compaction allowed
+ return nullptr;
+ }
+
+ std::vector<CompactionInputFiles> inputs(vstorage->num_levels() -
+ start_level);
+ for (int level = start_level; level < vstorage->num_levels(); level++) {
+ inputs[level - start_level].level = level;
+ auto& files = inputs[level - start_level].files;
+ for (FileMetaData* f : vstorage->LevelFiles(level)) {
+ files.push_back(f);
+ }
+ if (AreFilesInCompaction(files)) {
+ *manual_conflict = true;
+ return nullptr;
+ }
+ }
+
+ // 2 non-exclusive manual compactions could run at the same time producing
+    // overlapping outputs in the same level.
+ if (FilesRangeOverlapWithCompaction(inputs, output_level)) {
+ // This compaction output could potentially conflict with the output
+ // of a currently running compaction, we cannot run it.
+ *manual_conflict = true;
+ return nullptr;
+ }
+
+ Compaction* c = new Compaction(
+ vstorage, ioptions_, mutable_cf_options, std::move(inputs),
+ output_level,
+ MaxFileSizeForLevel(mutable_cf_options, output_level,
+ ioptions_.compaction_style),
+ /* max_compaction_bytes */ LLONG_MAX,
+ compact_range_options.target_path_id,
+ GetCompressionType(ioptions_, vstorage, mutable_cf_options,
+ output_level, 1),
+ GetCompressionOptions(ioptions_, vstorage, output_level),
+ compact_range_options.max_subcompactions, /* grandparents */ {},
+ /* is manual */ true);
+ RegisterCompaction(c);
+ return c;
+ }
+
+ CompactionInputFiles inputs;
+ inputs.level = input_level;
+ bool covering_the_whole_range = true;
+
+ // All files are 'overlapping' in universal style compaction.
+ // We have to compact the entire range in one shot.
+ if (ioptions_.compaction_style == kCompactionStyleUniversal) {
+ begin = nullptr;
+ end = nullptr;
+ }
+
+ vstorage->GetOverlappingInputs(input_level, begin, end, &inputs.files);
+ if (inputs.empty()) {
+ return nullptr;
+ }
+
+ if ((input_level == 0) && (!level0_compactions_in_progress_.empty())) {
+ // Only one level 0 compaction allowed
+ TEST_SYNC_POINT("CompactionPicker::CompactRange:Conflict");
+ *manual_conflict = true;
+ return nullptr;
+ }
+
+ // Avoid compacting too much in one shot in case the range is large.
+ // But we cannot do this for level-0 since level-0 files can overlap
+ // and we must not pick one file and drop another older file if the
+ // two files overlap.
+ if (input_level > 0) {
+ const uint64_t limit = mutable_cf_options.max_compaction_bytes;
+ uint64_t total = 0;
+ for (size_t i = 0; i + 1 < inputs.size(); ++i) {
+ uint64_t s = inputs[i]->compensated_file_size;
+ total += s;
+ if (total >= limit) {
+ covering_the_whole_range = false;
+ inputs.files.resize(i + 1);
+ break;
+ }
+ }
+ }
+ assert(compact_range_options.target_path_id <
+ static_cast<uint32_t>(ioptions_.cf_paths.size()));
+
+ // for BOTTOM LEVEL compaction only, use max_file_num_to_ignore to filter out
+ // files that are created during the current compaction.
+ if (compact_range_options.bottommost_level_compaction ==
+ BottommostLevelCompaction::kForceOptimized &&
+ max_file_num_to_ignore != port::kMaxUint64) {
+ assert(input_level == output_level);
+    // inputs_shrunk holds a contiguous subset of input files, all of which
+    // were created before the current manual compaction
+ std::vector<FileMetaData*> inputs_shrunk;
+ size_t skip_input_index = inputs.size();
+ for (size_t i = 0; i < inputs.size(); ++i) {
+ if (inputs[i]->fd.GetNumber() < max_file_num_to_ignore) {
+ inputs_shrunk.push_back(inputs[i]);
+ } else if (!inputs_shrunk.empty()) {
+ // inputs[i] was created during the current manual compaction and
+        // needs to be skipped
+ skip_input_index = i;
+ break;
+ }
+ }
+ if (inputs_shrunk.empty()) {
+ return nullptr;
+ }
+ if (inputs.size() != inputs_shrunk.size()) {
+ inputs.files.swap(inputs_shrunk);
+ }
+    // set covering_the_whole_range to false if there is any file that needs
+    // to be compacted in the range of inputs[skip_input_index+1, inputs.size())
+ for (size_t i = skip_input_index + 1; i < inputs.size(); ++i) {
+ if (inputs[i]->fd.GetNumber() < max_file_num_to_ignore) {
+ covering_the_whole_range = false;
+ }
+ }
+ }
+
+ InternalKey key_storage;
+ InternalKey* next_smallest = &key_storage;
+ if (ExpandInputsToCleanCut(cf_name, vstorage, &inputs, &next_smallest) ==
+ false) {
+    // Manual compaction is now multi-threaded, so it can happen that
+    // ExpandInputsToCleanCut fails; we handle it higher up in
+    // RunManualCompaction
+ *manual_conflict = true;
+ return nullptr;
+ }
+
+ if (covering_the_whole_range || !next_smallest) {
+ *compaction_end = nullptr;
+ } else {
+ **compaction_end = *next_smallest;
+ }
+
+ CompactionInputFiles output_level_inputs;
+ if (output_level == ColumnFamilyData::kCompactToBaseLevel) {
+ assert(input_level == 0);
+ output_level = vstorage->base_level();
+ assert(output_level > 0);
+ }
+ output_level_inputs.level = output_level;
+ if (input_level != output_level) {
+ int parent_index = -1;
+ if (!SetupOtherInputs(cf_name, mutable_cf_options, vstorage, &inputs,
+ &output_level_inputs, &parent_index, -1)) {
+      // Manual compaction is now multi-threaded, so it can happen that
+      // SetupOtherInputs fails; we handle it higher up in RunManualCompaction
+ *manual_conflict = true;
+ return nullptr;
+ }
+ }
+
+ std::vector<CompactionInputFiles> compaction_inputs({inputs});
+ if (!output_level_inputs.empty()) {
+ compaction_inputs.push_back(output_level_inputs);
+ }
+ for (size_t i = 0; i < compaction_inputs.size(); i++) {
+ if (AreFilesInCompaction(compaction_inputs[i].files)) {
+ *manual_conflict = true;
+ return nullptr;
+ }
+ }
+
+ // 2 non-exclusive manual compactions could run at the same time producing
+  // overlapping outputs in the same level.
+ if (FilesRangeOverlapWithCompaction(compaction_inputs, output_level)) {
+ // This compaction output could potentially conflict with the output
+ // of a currently running compaction, we cannot run it.
+ *manual_conflict = true;
+ return nullptr;
+ }
+
+ std::vector<FileMetaData*> grandparents;
+ GetGrandparents(vstorage, inputs, output_level_inputs, &grandparents);
+ Compaction* compaction = new Compaction(
+ vstorage, ioptions_, mutable_cf_options, std::move(compaction_inputs),
+ output_level,
+ MaxFileSizeForLevel(mutable_cf_options, output_level,
+ ioptions_.compaction_style, vstorage->base_level(),
+ ioptions_.level_compaction_dynamic_level_bytes),
+ mutable_cf_options.max_compaction_bytes,
+ compact_range_options.target_path_id,
+ GetCompressionType(ioptions_, vstorage, mutable_cf_options, output_level,
+ vstorage->base_level()),
+ GetCompressionOptions(ioptions_, vstorage, output_level),
+ compact_range_options.max_subcompactions, std::move(grandparents),
+ /* is manual compaction */ true);
+
+ TEST_SYNC_POINT_CALLBACK("CompactionPicker::CompactRange:Return", compaction);
+ RegisterCompaction(compaction);
+
+ // Creating a compaction influences the compaction score because the score
+ // takes running compactions into account (by skipping files that are already
+  // being compacted). Since we have just changed the compaction score, we
+  // recalculate it here
+ vstorage->ComputeCompactionScore(ioptions_, mutable_cf_options);
+
+ return compaction;
+}
+
+#ifndef ROCKSDB_LITE
+namespace {
+// Test whether two files have overlapping key-ranges.
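+// For example, key ranges [a, f] and [d, m] overlap while [a, c] and [d, m]
+// do not; the check is symmetric in a and b (illustrative keys only).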
+bool HaveOverlappingKeyRanges(const Comparator* c, const SstFileMetaData& a,
+ const SstFileMetaData& b) {
+ if (c->Compare(a.smallestkey, b.smallestkey) >= 0) {
+ if (c->Compare(a.smallestkey, b.largestkey) <= 0) {
+ // b.smallestkey <= a.smallestkey <= b.largestkey
+ return true;
+ }
+ } else if (c->Compare(a.largestkey, b.smallestkey) >= 0) {
+ // a.smallestkey < b.smallestkey <= a.largestkey
+ return true;
+ }
+ if (c->Compare(a.largestkey, b.largestkey) <= 0) {
+ if (c->Compare(a.largestkey, b.smallestkey) >= 0) {
+ // b.smallestkey <= a.largestkey <= b.largestkey
+ return true;
+ }
+ } else if (c->Compare(a.smallestkey, b.largestkey) <= 0) {
+ // a.smallestkey <= b.largestkey < a.largestkey
+ return true;
+ }
+ return false;
+}
+} // namespace
+
+Status CompactionPicker::SanitizeCompactionInputFilesForAllLevels(
+ std::unordered_set<uint64_t>* input_files,
+ const ColumnFamilyMetaData& cf_meta, const int output_level) const {
+ auto& levels = cf_meta.levels;
+ auto comparator = icmp_->user_comparator();
+
+ // TODO(yhchiang): add is_adjustable to CompactionOptions
+
+ // the smallest and largest key of the current compaction input
+ std::string smallestkey;
+ std::string largestkey;
+ // a flag for initializing smallest and largest key
+ bool is_first = false;
+ const int kNotFound = -1;
+
+ // For each level, it does the following things:
+ // 1. Find the first and the last compaction input files
+ // in the current level.
+ // 2. Include all files between the first and the last
+ // compaction input files.
+ // 3. Update the compaction key-range.
+ // 4. For all remaining levels, include files that have
+ // overlapping key-range with the compaction key-range.
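+  // Sketch of the effect on a hypothetical input: if input_files initially
+  // names a single L1 file, step 2 may pull in its L1 neighbors that share
+  // boundary user keys, step 3 widens [smallestkey, largestkey] accordingly,
+  // and step 4 then adds every file in L2..output_level overlapping the
+  // widened range.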
+ for (int l = 0; l <= output_level; ++l) {
+ auto& current_files = levels[l].files;
+ int first_included = static_cast<int>(current_files.size());
+ int last_included = kNotFound;
+
+ // identify the first and the last compaction input files
+ // in the current level.
+ for (size_t f = 0; f < current_files.size(); ++f) {
+ if (input_files->find(TableFileNameToNumber(current_files[f].name)) !=
+ input_files->end()) {
+ first_included = std::min(first_included, static_cast<int>(f));
+ last_included = std::max(last_included, static_cast<int>(f));
+ if (is_first == false) {
+ smallestkey = current_files[f].smallestkey;
+ largestkey = current_files[f].largestkey;
+ is_first = true;
+ }
+ }
+ }
+ if (last_included == kNotFound) {
+ continue;
+ }
+
+ if (l != 0) {
+      // Expand the compaction input of the current level if it has an
+      // overlapping key-range with other non-compaction-input files in the
+      // same level.
+ while (first_included > 0) {
+ if (comparator->Compare(current_files[first_included - 1].largestkey,
+ current_files[first_included].smallestkey) <
+ 0) {
+ break;
+ }
+ first_included--;
+ }
+
+ while (last_included < static_cast<int>(current_files.size()) - 1) {
+ if (comparator->Compare(current_files[last_included + 1].smallestkey,
+ current_files[last_included].largestkey) > 0) {
+ break;
+ }
+ last_included++;
+ }
+ } else if (output_level > 0) {
+ last_included = static_cast<int>(current_files.size() - 1);
+ }
+
+ // include all files between the first and the last compaction input files.
+ for (int f = first_included; f <= last_included; ++f) {
+ if (current_files[f].being_compacted) {
+ return Status::Aborted("Necessary compaction input file " +
+ current_files[f].name +
+ " is currently being compacted.");
+ }
+ input_files->insert(TableFileNameToNumber(current_files[f].name));
+ }
+
+ // update smallest and largest key
+ if (l == 0) {
+ for (int f = first_included; f <= last_included; ++f) {
+ if (comparator->Compare(smallestkey, current_files[f].smallestkey) >
+ 0) {
+ smallestkey = current_files[f].smallestkey;
+ }
+ if (comparator->Compare(largestkey, current_files[f].largestkey) < 0) {
+ largestkey = current_files[f].largestkey;
+ }
+ }
+ } else {
+ if (comparator->Compare(smallestkey,
+ current_files[first_included].smallestkey) > 0) {
+ smallestkey = current_files[first_included].smallestkey;
+ }
+ if (comparator->Compare(largestkey,
+ current_files[last_included].largestkey) < 0) {
+ largestkey = current_files[last_included].largestkey;
+ }
+ }
+
+ SstFileMetaData aggregated_file_meta;
+ aggregated_file_meta.smallestkey = smallestkey;
+ aggregated_file_meta.largestkey = largestkey;
+
+    // For all lower levels, include all overlapping files.
+    // We need to add overlapping files from the current level too because even
+    // if there are no input_files in level l, we would still need to add files
+    // which overlap with the range containing the input_files in levels 0 to l.
+    // Level 0 doesn't need to be handled this way because its files are sorted
+    // by time and not by key.
+ for (int m = std::max(l, 1); m <= output_level; ++m) {
+ for (auto& next_lv_file : levels[m].files) {
+ if (HaveOverlappingKeyRanges(comparator, aggregated_file_meta,
+ next_lv_file)) {
+ if (next_lv_file.being_compacted) {
+ return Status::Aborted(
+ "File " + next_lv_file.name +
+ " that has overlapping key range with one of the compaction "
+ " input file is currently being compacted.");
+ }
+ input_files->insert(TableFileNameToNumber(next_lv_file.name));
+ }
+ }
+ }
+ }
+ if (RangeOverlapWithCompaction(smallestkey, largestkey, output_level)) {
+ return Status::Aborted(
+ "A running compaction is writing to the same output level in an "
+ "overlapping key range");
+ }
+ return Status::OK();
+}
+
+Status CompactionPicker::SanitizeCompactionInputFiles(
+ std::unordered_set<uint64_t>* input_files,
+ const ColumnFamilyMetaData& cf_meta, const int output_level) const {
+ assert(static_cast<int>(cf_meta.levels.size()) - 1 ==
+ cf_meta.levels[cf_meta.levels.size() - 1].level);
+ if (output_level >= static_cast<int>(cf_meta.levels.size())) {
+ return Status::InvalidArgument(
+ "Output level for column family " + cf_meta.name +
+ " must between [0, " +
+ ToString(cf_meta.levels[cf_meta.levels.size() - 1].level) + "].");
+ }
+
+ if (output_level > MaxOutputLevel()) {
+ return Status::InvalidArgument(
+ "Exceed the maximum output level defined by "
+ "the current compaction algorithm --- " +
+ ToString(MaxOutputLevel()));
+ }
+
+ if (output_level < 0) {
+ return Status::InvalidArgument("Output level cannot be negative.");
+ }
+
+ if (input_files->size() == 0) {
+ return Status::InvalidArgument(
+ "A compaction must contain at least one file.");
+ }
+
+ Status s = SanitizeCompactionInputFilesForAllLevels(input_files, cf_meta,
+ output_level);
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ // for all input files, check whether the file number matches
+ // any currently-existing files.
+ for (auto file_num : *input_files) {
+ bool found = false;
+ for (const auto& level_meta : cf_meta.levels) {
+ for (const auto& file_meta : level_meta.files) {
+ if (file_num == TableFileNameToNumber(file_meta.name)) {
+ if (file_meta.being_compacted) {
+ return Status::Aborted("Specified compaction input file " +
+ MakeTableFileName("", file_num) +
+ " is already being compacted.");
+ }
+ found = true;
+ break;
+ }
+ }
+ if (found) {
+ break;
+ }
+ }
+ if (!found) {
+ return Status::InvalidArgument(
+ "Specified compaction input file " + MakeTableFileName("", file_num) +
+ " does not exist in column family " + cf_meta.name + ".");
+ }
+ }
+
+ return Status::OK();
+}
+#endif // !ROCKSDB_LITE
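SanitizeCompactionInputFiles() is the validation step behind the public DB::CompactFiles() API: the caller supplies table file names, and the picker may pull in more files or reject the request. A minimal sketch of how a caller might drive it, assuming the public rocksdb/db.h API (GetColumnFamilyMetaData() and CompactFiles()); the helper name is hypothetical and the code is illustrative, not part of this patch:

#include <string>
#include <vector>
#include "rocksdb/db.h"

// Collect the names of all L0 files and ask the DB to compact them into L1.
// DB::CompactFiles() goes through SanitizeCompactionInputFiles(), which may
// add more files or return a non-OK status if the request cannot be made
// valid.
rocksdb::Status CompactAllL0Files(rocksdb::DB* db) {
  rocksdb::ColumnFamilyMetaData cf_meta;
  db->GetColumnFamilyMetaData(&cf_meta);
  std::vector<std::string> input_file_names;
  for (const auto& file : cf_meta.levels[0].files) {
    input_file_names.push_back(file.name);
  }
  if (input_file_names.empty()) {
    return rocksdb::Status::OK();  // nothing to do
  }
  return db->CompactFiles(rocksdb::CompactionOptions(), input_file_names,
                          /*output_level=*/1);
}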
+
+void CompactionPicker::RegisterCompaction(Compaction* c) {
+ if (c == nullptr) {
+ return;
+ }
+ assert(ioptions_.compaction_style != kCompactionStyleLevel ||
+ c->output_level() == 0 ||
+ !FilesRangeOverlapWithCompaction(*c->inputs(), c->output_level()));
+ if (c->start_level() == 0 ||
+ ioptions_.compaction_style == kCompactionStyleUniversal) {
+ level0_compactions_in_progress_.insert(c);
+ }
+ compactions_in_progress_.insert(c);
+}
+
+void CompactionPicker::UnregisterCompaction(Compaction* c) {
+ if (c == nullptr) {
+ return;
+ }
+ if (c->start_level() == 0 ||
+ ioptions_.compaction_style == kCompactionStyleUniversal) {
+ level0_compactions_in_progress_.erase(c);
+ }
+ compactions_in_progress_.erase(c);
+}
+
+void CompactionPicker::PickFilesMarkedForCompaction(
+ const std::string& cf_name, VersionStorageInfo* vstorage, int* start_level,
+ int* output_level, CompactionInputFiles* start_level_inputs) {
+ if (vstorage->FilesMarkedForCompaction().empty()) {
+ return;
+ }
+
+ auto continuation = [&, cf_name](std::pair<int, FileMetaData*> level_file) {
+    // If it's being compacted, there is nothing to do here.
+    // If this assert() fails, it means that some function marked some
+    // files as being_compacted but didn't call ComputeCompactionScore().
+ assert(!level_file.second->being_compacted);
+ *start_level = level_file.first;
+ *output_level =
+ (*start_level == 0) ? vstorage->base_level() : *start_level + 1;
+
+ if (*start_level == 0 && !level0_compactions_in_progress()->empty()) {
+ return false;
+ }
+
+ start_level_inputs->files = {level_file.second};
+ start_level_inputs->level = *start_level;
+ return ExpandInputsToCleanCut(cf_name, vstorage, start_level_inputs);
+ };
+
+ // take a chance on a random file first
+ Random64 rnd(/* seed */ reinterpret_cast<uint64_t>(vstorage));
+ size_t random_file_index = static_cast<size_t>(rnd.Uniform(
+ static_cast<uint64_t>(vstorage->FilesMarkedForCompaction().size())));
+
+ if (continuation(vstorage->FilesMarkedForCompaction()[random_file_index])) {
+ // found the compaction!
+ return;
+ }
+
+ for (auto& level_file : vstorage->FilesMarkedForCompaction()) {
+ if (continuation(level_file)) {
+ // found the compaction!
+ return;
+ }
+ }
+ start_level_inputs->files.clear();
+}
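The picking strategy above is "try one random candidate first, then fall back to scanning them all", which spreads the work across marked files instead of always retrying the same one. A generic standalone sketch of the same pattern (illustrative only, not part of this patch; names are hypothetical):

#include <cassert>
#include <cstddef>
#include <functional>
#include <random>
#include <vector>

// Try one randomly chosen candidate first, then scan all candidates in order.
template <typename T>
static bool TryRandomThenScan(const std::vector<T>& candidates,
                              const std::function<bool(const T&)>& try_one,
                              std::mt19937_64& rng) {
  if (candidates.empty()) {
    return false;
  }
  std::uniform_int_distribution<size_t> dist(0, candidates.size() - 1);
  if (try_one(candidates[dist(rng)])) {
    return true;
  }
  for (const auto& c : candidates) {
    if (try_one(c)) {
      return true;
    }
  }
  return false;
}

int main() {
  std::mt19937_64 rng(42);
  std::vector<int> xs = {1, 3, 5, 8, 9};
  bool found =
      TryRandomThenScan<int>(xs, [](const int& x) { return x % 2 == 0; }, rng);
  assert(found);  // 8 is even, so the random pick or the scan finds it
  return 0;
}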
+
+bool CompactionPicker::GetOverlappingL0Files(
+ VersionStorageInfo* vstorage, CompactionInputFiles* start_level_inputs,
+ int output_level, int* parent_index) {
+  // Two level-0 compactions won't run at the same time, so we don't need to
+  // worry about files on level 0 being compacted.
+ assert(level0_compactions_in_progress()->empty());
+ InternalKey smallest, largest;
+ GetRange(*start_level_inputs, &smallest, &largest);
+ // Note that the next call will discard the file we placed in
+ // c->inputs_[0] earlier and replace it with an overlapping set
+ // which will include the picked file.
+ start_level_inputs->files.clear();
+ vstorage->GetOverlappingInputs(0, &smallest, &largest,
+ &(start_level_inputs->files));
+
+  // If we include more L0 files in the same compaction run, it can cause the
+  // 'smallest' and 'largest' keys to be extended to a larger range. So,
+  // re-invoke GetRange to get the new key range.
+ GetRange(*start_level_inputs, &smallest, &largest);
+ if (IsRangeInCompaction(vstorage, &smallest, &largest, output_level,
+ parent_index)) {
+ return false;
+ }
+ assert(!start_level_inputs->files.empty());
+
+ return true;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_picker.h b/src/rocksdb/db/compaction/compaction_picker.h
new file mode 100644
index 000000000..36d570e68
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker.h
@@ -0,0 +1,313 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <memory>
+#include <set>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "db/compaction/compaction.h"
+#include "db/version_set.h"
+#include "options/cf_options.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// This file contains an abstract class CompactionPicker, its two
+// sub-classes LevelCompactionPicker and NullCompactionPicker, and
+// some helper functions used by them.
+
+class LogBuffer;
+class Compaction;
+class VersionStorageInfo;
+struct CompactionInputFiles;
+
+// An abstract class to pick compactions from an existing LSM-tree.
+//
+// Each compaction style inherits the class and implements the
+// interface to form automatic compactions. If NeedsCompaction() is true,
+// then call PickCompaction() to find what files need to be compacted
+// and where to put the output files.
+//
+// Non-virtual functions CompactRange() and CompactFiles() are used to
+// pick files to compact based on users' DB::CompactRange() and
+// DB::CompactFiles() requests, respectively. There is little
+// compaction style specific logic for them.
+class CompactionPicker {
+ public:
+ CompactionPicker(const ImmutableCFOptions& ioptions,
+ const InternalKeyComparator* icmp);
+ virtual ~CompactionPicker();
+
+ // Pick level and inputs for a new compaction.
+ // Returns nullptr if there is no compaction to be done.
+ // Otherwise returns a pointer to a heap-allocated object that
+ // describes the compaction. Caller should delete the result.
+ virtual Compaction* PickCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* vstorage, LogBuffer* log_buffer,
+ SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) = 0;
+
+ // Return a compaction object for compacting the range [begin,end] in
+ // the specified level. Returns nullptr if there is nothing in that
+ // level that overlaps the specified range. Caller should delete
+ // the result.
+ //
+ // The returned Compaction might not include the whole requested range.
+ // In that case, compaction_end will be set to the next key that needs
+  // compacting. If the compaction compacts the whole range,
+  // compaction_end will be set to nullptr.
+  // The client is responsible for compaction_end storage -- when called,
+  // *compaction_end should point to a valid InternalKey!
+ virtual Compaction* CompactRange(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* vstorage, int input_level, int output_level,
+ const CompactRangeOptions& compact_range_options,
+ const InternalKey* begin, const InternalKey* end,
+ InternalKey** compaction_end, bool* manual_conflict,
+ uint64_t max_file_num_to_ignore);
+
+ // The maximum allowed output level. Default value is NumberLevels() - 1.
+ virtual int MaxOutputLevel() const { return NumberLevels() - 1; }
+
+ virtual bool NeedsCompaction(const VersionStorageInfo* vstorage) const = 0;
+
+// Sanitize the input set of compaction input files.
+// When the input parameters do not describe a valid compaction, the
+// function will try to fix the input_files by adding necessary
+// files. If it's not possible to convert an invalid input_files
+// into a valid one by adding more files, the function will return a
+// non-ok status with a specific reason.
+#ifndef ROCKSDB_LITE
+ Status SanitizeCompactionInputFiles(std::unordered_set<uint64_t>* input_files,
+ const ColumnFamilyMetaData& cf_meta,
+ const int output_level) const;
+#endif // ROCKSDB_LITE
+
+ // Free up the files that participated in a compaction
+ //
+ // Requirement: DB mutex held
+ void ReleaseCompactionFiles(Compaction* c, Status status);
+
+  // Returns true if any one of the specified files is being compacted
+ bool AreFilesInCompaction(const std::vector<FileMetaData*>& files);
+
+ // Takes a list of CompactionInputFiles and returns a (manual) Compaction
+ // object.
+ //
+ // Caller must provide a set of input files that has been passed through
+ // `SanitizeCompactionInputFiles` earlier. The lock should not be released
+ // between that call and this one.
+ Compaction* CompactFiles(const CompactionOptions& compact_options,
+ const std::vector<CompactionInputFiles>& input_files,
+ int output_level, VersionStorageInfo* vstorage,
+ const MutableCFOptions& mutable_cf_options,
+ uint32_t output_path_id);
+
+ // Converts a set of compaction input file numbers into
+ // a list of CompactionInputFiles.
+ Status GetCompactionInputsFromFileNumbers(
+ std::vector<CompactionInputFiles>* input_files,
+ std::unordered_set<uint64_t>* input_set,
+ const VersionStorageInfo* vstorage,
+ const CompactionOptions& compact_options) const;
+
+ // Is there currently a compaction involving level 0 taking place
+ bool IsLevel0CompactionInProgress() const {
+ return !level0_compactions_in_progress_.empty();
+ }
+
+  // Return true if the passed key range overlaps with a compaction output
+  // that is currently running.
+ bool RangeOverlapWithCompaction(const Slice& smallest_user_key,
+ const Slice& largest_user_key,
+ int level) const;
+
+ // Stores the minimal range that covers all entries in inputs in
+ // *smallest, *largest.
+ // REQUIRES: inputs is not empty
+ void GetRange(const CompactionInputFiles& inputs, InternalKey* smallest,
+ InternalKey* largest) const;
+
+ // Stores the minimal range that covers all entries in inputs1 and inputs2
+ // in *smallest, *largest.
+ // REQUIRES: inputs is not empty
+ void GetRange(const CompactionInputFiles& inputs1,
+ const CompactionInputFiles& inputs2, InternalKey* smallest,
+ InternalKey* largest) const;
+
+ // Stores the minimal range that covers all entries in inputs
+ // in *smallest, *largest.
+  // REQUIRES: inputs is not empty (at least one entry has one file)
+ void GetRange(const std::vector<CompactionInputFiles>& inputs,
+ InternalKey* smallest, InternalKey* largest) const;
+
+ int NumberLevels() const { return ioptions_.num_levels; }
+
+ // Add more files to the inputs on "level" to make sure that
+ // no newer version of a key is compacted to "level+1" while leaving an older
+ // version in a "level". Otherwise, any Get() will search "level" first,
+ // and will likely return an old/stale value for the key, since it always
+ // searches in increasing order of level to find the value. This could
+ // also scramble the order of merge operands. This function should be
+ // called any time a new Compaction is created, and its inputs_[0] are
+ // populated.
+ //
+ // Will return false if it is impossible to apply this compaction.
+ bool ExpandInputsToCleanCut(const std::string& cf_name,
+ VersionStorageInfo* vstorage,
+ CompactionInputFiles* inputs,
+ InternalKey** next_smallest = nullptr);
+
+  // Returns true if any one of the parent files is being compacted
+ bool IsRangeInCompaction(VersionStorageInfo* vstorage,
+ const InternalKey* smallest,
+ const InternalKey* largest, int level, int* index);
+
+  // Returns true if the key range that the `inputs` files cover overlaps with
+  // the key range of a currently running compaction.
+ bool FilesRangeOverlapWithCompaction(
+ const std::vector<CompactionInputFiles>& inputs, int level) const;
+
+ bool SetupOtherInputs(const std::string& cf_name,
+ const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* vstorage,
+ CompactionInputFiles* inputs,
+ CompactionInputFiles* output_level_inputs,
+ int* parent_index, int base_index);
+
+ void GetGrandparents(VersionStorageInfo* vstorage,
+ const CompactionInputFiles& inputs,
+ const CompactionInputFiles& output_level_inputs,
+ std::vector<FileMetaData*>* grandparents);
+
+ void PickFilesMarkedForCompaction(const std::string& cf_name,
+ VersionStorageInfo* vstorage,
+ int* start_level, int* output_level,
+ CompactionInputFiles* start_level_inputs);
+
+ bool GetOverlappingL0Files(VersionStorageInfo* vstorage,
+ CompactionInputFiles* start_level_inputs,
+ int output_level, int* parent_index);
+
+ // Register this compaction in the set of running compactions
+ void RegisterCompaction(Compaction* c);
+
+ // Remove this compaction from the set of running compactions
+ void UnregisterCompaction(Compaction* c);
+
+ std::set<Compaction*>* level0_compactions_in_progress() {
+ return &level0_compactions_in_progress_;
+ }
+ std::unordered_set<Compaction*>* compactions_in_progress() {
+ return &compactions_in_progress_;
+ }
+
+ protected:
+ const ImmutableCFOptions& ioptions_;
+
+// A helper function to SanitizeCompactionInputFiles() that
+// sanitizes "input_files" by adding necessary files.
+#ifndef ROCKSDB_LITE
+ virtual Status SanitizeCompactionInputFilesForAllLevels(
+ std::unordered_set<uint64_t>* input_files,
+ const ColumnFamilyMetaData& cf_meta, const int output_level) const;
+#endif // ROCKSDB_LITE
+
+ // Keeps track of all compactions that are running on Level0.
+ // Protected by DB mutex
+ std::set<Compaction*> level0_compactions_in_progress_;
+
+ // Keeps track of all compactions that are running.
+ // Protected by DB mutex
+ std::unordered_set<Compaction*> compactions_in_progress_;
+
+ const InternalKeyComparator* const icmp_;
+};
+
+#ifndef ROCKSDB_LITE
+// A dummy compaction picker that never triggers any automatic
+// compaction.
+class NullCompactionPicker : public CompactionPicker {
+ public:
+ NullCompactionPicker(const ImmutableCFOptions& ioptions,
+ const InternalKeyComparator* icmp)
+ : CompactionPicker(ioptions, icmp) {}
+ virtual ~NullCompactionPicker() {}
+
+ // Always return "nullptr"
+ Compaction* PickCompaction(
+ const std::string& /*cf_name*/,
+ const MutableCFOptions& /*mutable_cf_options*/,
+ VersionStorageInfo* /*vstorage*/, LogBuffer* /* log_buffer */,
+ SequenceNumber /* earliest_memtable_seqno */) override {
+ return nullptr;
+ }
+
+ // Always return "nullptr"
+ Compaction* CompactRange(const std::string& /*cf_name*/,
+ const MutableCFOptions& /*mutable_cf_options*/,
+ VersionStorageInfo* /*vstorage*/,
+ int /*input_level*/, int /*output_level*/,
+ const CompactRangeOptions& /*compact_range_options*/,
+ const InternalKey* /*begin*/,
+ const InternalKey* /*end*/,
+ InternalKey** /*compaction_end*/,
+ bool* /*manual_conflict*/,
+ uint64_t /*max_file_num_to_ignore*/) override {
+ return nullptr;
+ }
+
+ // Always returns false.
+ virtual bool NeedsCompaction(
+ const VersionStorageInfo* /*vstorage*/) const override {
+ return false;
+ }
+};
+#endif // !ROCKSDB_LITE
+
+// Attempts to find an intra L0 compaction conforming to the given parameters.
+//
+// @param level_files Metadata for L0 files.
+// @param min_files_to_compact Minimum number of files required to
+// do the compaction.
+// @param max_compact_bytes_per_del_file Maximum average size in bytes per
+// file that is going to get deleted by
+// the compaction.
+// @param max_compaction_bytes Maximum total size in bytes (in terms
+// of compensated file size) for files
+// to be compacted.
+// @param [out] comp_inputs If a compaction was found, will be
+// initialized with corresponding input
+// files. Cannot be nullptr.
+//
+// @return true iff compaction was found.
+bool FindIntraL0Compaction(
+ const std::vector<FileMetaData*>& level_files, size_t min_files_to_compact,
+ uint64_t max_compact_bytes_per_del_file, uint64_t max_compaction_bytes,
+ CompactionInputFiles* comp_inputs,
+ SequenceNumber earliest_mem_seqno = kMaxSequenceNumber);
+
+CompressionType GetCompressionType(const ImmutableCFOptions& ioptions,
+ const VersionStorageInfo* vstorage,
+ const MutableCFOptions& mutable_cf_options,
+ int level, int base_level,
+ const bool enable_compression = true);
+
+CompressionOptions GetCompressionOptions(const ImmutableCFOptions& ioptions,
+ const VersionStorageInfo* vstorage,
+ int level,
+ const bool enable_compression = true);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_picker_fifo.cc b/src/rocksdb/db/compaction/compaction_picker_fifo.cc
new file mode 100644
index 000000000..b148aadc2
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_fifo.cc
@@ -0,0 +1,244 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction_picker_fifo.h"
+#ifndef ROCKSDB_LITE
+
+#include <cinttypes>
+#include <string>
+#include <vector>
+#include "db/column_family.h"
+#include "logging/log_buffer.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+uint64_t GetTotalFilesSize(const std::vector<FileMetaData*>& files) {
+ uint64_t total_size = 0;
+ for (const auto& f : files) {
+ total_size += f->fd.file_size;
+ }
+ return total_size;
+}
+} // anonymous namespace
+
+bool FIFOCompactionPicker::NeedsCompaction(
+ const VersionStorageInfo* vstorage) const {
+ const int kLevel0 = 0;
+ return vstorage->CompactionScore(kLevel0) >= 1;
+}
+
+Compaction* FIFOCompactionPicker::PickTTLCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* vstorage, LogBuffer* log_buffer) {
+ assert(mutable_cf_options.ttl > 0);
+
+ const int kLevel0 = 0;
+ const std::vector<FileMetaData*>& level_files = vstorage->LevelFiles(kLevel0);
+ uint64_t total_size = GetTotalFilesSize(level_files);
+
+ int64_t _current_time;
+ auto status = ioptions_.env->GetCurrentTime(&_current_time);
+ if (!status.ok()) {
+ ROCKS_LOG_BUFFER(log_buffer,
+ "[%s] FIFO compaction: Couldn't get current time: %s. "
+ "Not doing compactions based on TTL. ",
+ cf_name.c_str(), status.ToString().c_str());
+ return nullptr;
+ }
+ const uint64_t current_time = static_cast<uint64_t>(_current_time);
+
+ if (!level0_compactions_in_progress_.empty()) {
+ ROCKS_LOG_BUFFER(
+ log_buffer,
+ "[%s] FIFO compaction: Already executing compaction. No need "
+ "to run parallel compactions since compactions are very fast",
+ cf_name.c_str());
+ return nullptr;
+ }
+
+ std::vector<CompactionInputFiles> inputs;
+ inputs.emplace_back();
+ inputs[0].level = 0;
+
+ // avoid underflow
+ if (current_time > mutable_cf_options.ttl) {
+ for (auto ritr = level_files.rbegin(); ritr != level_files.rend(); ++ritr) {
+ FileMetaData* f = *ritr;
+ if (f->fd.table_reader && f->fd.table_reader->GetTableProperties()) {
+ uint64_t creation_time =
+ f->fd.table_reader->GetTableProperties()->creation_time;
+ if (creation_time == 0 ||
+ creation_time >= (current_time - mutable_cf_options.ttl)) {
+ break;
+ }
+ total_size -= f->compensated_file_size;
+ inputs[0].files.push_back(f);
+ }
+ }
+ }
+
+  // Return a nullptr and proceed to size-based FIFO compaction if:
+  // 1. there are no files older than ttl, OR
+  // 2. there are some files older than ttl, but deleting them will not bring
+  //    the total size below the max_table_files_size threshold.
+ if (inputs[0].files.empty() ||
+ total_size >
+ mutable_cf_options.compaction_options_fifo.max_table_files_size) {
+ return nullptr;
+ }
+
+ for (const auto& f : inputs[0].files) {
+ uint64_t creation_time = 0;
+ if (f && f->fd.table_reader && f->fd.table_reader->GetTableProperties()) {
+ creation_time = f->fd.table_reader->GetTableProperties()->creation_time;
+ }
+ ROCKS_LOG_BUFFER(log_buffer,
+ "[%s] FIFO compaction: picking file %" PRIu64
+ " with creation time %" PRIu64 " for deletion",
+ cf_name.c_str(), f->fd.GetNumber(), creation_time);
+ }
+
+ Compaction* c = new Compaction(
+ vstorage, ioptions_, mutable_cf_options, std::move(inputs), 0, 0, 0, 0,
+ kNoCompression, ioptions_.compression_opts, /* max_subcompactions */ 0,
+ {}, /* is manual */ false, vstorage->CompactionScore(0),
+ /* is deletion compaction */ true, CompactionReason::kFIFOTtl);
+ return c;
+}
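The TTL test in the loop above reduces to a small piece of arithmetic: a file is a candidate only when its creation time is known (non-zero) and strictly older than current_time - ttl, with the subtraction guarded against underflow. A standalone sketch of just that predicate (illustrative only, not part of this patch; the helper name is hypothetical):

#include <cassert>
#include <cstdint>

// A file is a TTL candidate when its creation time is known (non-zero) and
// strictly older than `current_time - ttl`; the subtraction is guarded
// against underflow, mirroring the check in PickTTLCompaction().
static bool IsTtlExpired(uint64_t creation_time, uint64_t current_time,
                         uint64_t ttl) {
  if (current_time <= ttl || creation_time == 0) {
    return false;
  }
  return creation_time < current_time - ttl;
}

int main() {
  assert(IsTtlExpired(/*creation_time=*/100, /*current_time=*/1000, /*ttl=*/500));
  assert(!IsTtlExpired(/*creation_time=*/900, /*current_time=*/1000, /*ttl=*/500));
  assert(!IsTtlExpired(/*creation_time=*/0, /*current_time=*/1000, /*ttl=*/500));
  return 0;
}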
+
+Compaction* FIFOCompactionPicker::PickSizeCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* vstorage, LogBuffer* log_buffer) {
+ const int kLevel0 = 0;
+ const std::vector<FileMetaData*>& level_files = vstorage->LevelFiles(kLevel0);
+ uint64_t total_size = GetTotalFilesSize(level_files);
+
+ if (total_size <=
+ mutable_cf_options.compaction_options_fifo.max_table_files_size ||
+ level_files.size() == 0) {
+ // total size not exceeded
+ if (mutable_cf_options.compaction_options_fifo.allow_compaction &&
+ level_files.size() > 0) {
+ CompactionInputFiles comp_inputs;
+      // Try to prevent the same files from being compacted multiple times,
+      // which could produce large files that may never TTL-expire. Achieve
+      // this by disallowing compactions with files larger than the memtable
+      // (inflate its size by 10% to account for uncompressed L0 files that may
+      // have a size slightly greater than the memtable size limit).
+ size_t max_compact_bytes_per_del_file =
+ static_cast<size_t>(MultiplyCheckOverflow(
+ static_cast<uint64_t>(mutable_cf_options.write_buffer_size),
+ 1.1));
+ if (FindIntraL0Compaction(
+ level_files,
+ mutable_cf_options
+ .level0_file_num_compaction_trigger /* min_files_to_compact */
+ ,
+ max_compact_bytes_per_del_file,
+ mutable_cf_options.max_compaction_bytes, &comp_inputs)) {
+ Compaction* c = new Compaction(
+ vstorage, ioptions_, mutable_cf_options, {comp_inputs}, 0,
+ 16 * 1024 * 1024 /* output file size limit */,
+ 0 /* max compaction bytes, not applicable */,
+ 0 /* output path ID */, mutable_cf_options.compression,
+ ioptions_.compression_opts, 0 /* max_subcompactions */, {},
+ /* is manual */ false, vstorage->CompactionScore(0),
+ /* is deletion compaction */ false,
+ CompactionReason::kFIFOReduceNumFiles);
+ return c;
+ }
+ }
+
+ ROCKS_LOG_BUFFER(
+ log_buffer,
+ "[%s] FIFO compaction: nothing to do. Total size %" PRIu64
+ ", max size %" PRIu64 "\n",
+ cf_name.c_str(), total_size,
+ mutable_cf_options.compaction_options_fifo.max_table_files_size);
+ return nullptr;
+ }
+
+ if (!level0_compactions_in_progress_.empty()) {
+ ROCKS_LOG_BUFFER(
+ log_buffer,
+ "[%s] FIFO compaction: Already executing compaction. No need "
+ "to run parallel compactions since compactions are very fast",
+ cf_name.c_str());
+ return nullptr;
+ }
+
+ std::vector<CompactionInputFiles> inputs;
+ inputs.emplace_back();
+ inputs[0].level = 0;
+
+ for (auto ritr = level_files.rbegin(); ritr != level_files.rend(); ++ritr) {
+ auto f = *ritr;
+ total_size -= f->compensated_file_size;
+ inputs[0].files.push_back(f);
+ char tmp_fsize[16];
+ AppendHumanBytes(f->fd.GetFileSize(), tmp_fsize, sizeof(tmp_fsize));
+ ROCKS_LOG_BUFFER(log_buffer,
+ "[%s] FIFO compaction: picking file %" PRIu64
+ " with size %s for deletion",
+ cf_name.c_str(), f->fd.GetNumber(), tmp_fsize);
+ if (total_size <=
+ mutable_cf_options.compaction_options_fifo.max_table_files_size) {
+ break;
+ }
+ }
+
+ Compaction* c = new Compaction(
+ vstorage, ioptions_, mutable_cf_options, std::move(inputs), 0, 0, 0, 0,
+ kNoCompression, ioptions_.compression_opts, /* max_subcompactions */ 0,
+ {}, /* is manual */ false, vstorage->CompactionScore(0),
+ /* is deletion compaction */ true, CompactionReason::kFIFOMaxSize);
+ return c;
+}
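The size-based path above walks the L0 list from oldest to newest and keeps marking files for deletion until the remaining total drops to max_table_files_size or below. A standalone sketch of that accounting (illustrative only, not part of this patch; the helper name is hypothetical):

#include <cassert>
#include <cstdint>
#include <vector>

// The size-based FIFO policy drops the oldest files (the back of the
// newest-first L0 list) until the remaining total is within the limit.
// Returns how many of the oldest files would be dropped.
static size_t CountFilesToDrop(const std::vector<uint64_t>& sizes_newest_first,
                               uint64_t max_table_files_size) {
  uint64_t total = 0;
  for (uint64_t s : sizes_newest_first) total += s;
  size_t dropped = 0;
  for (auto it = sizes_newest_first.rbegin();
       it != sizes_newest_first.rend() && total > max_table_files_size; ++it) {
    total -= *it;
    ++dropped;
  }
  return dropped;
}

int main() {
  // Four files of 10 bytes each, newest first; with a 25-byte limit the two
  // oldest files are dropped, leaving 20 bytes.
  assert(CountFilesToDrop({10, 10, 10, 10}, 25) == 2);
  return 0;
}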
+
+Compaction* FIFOCompactionPicker::PickCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* vstorage, LogBuffer* log_buffer,
+ SequenceNumber /*earliest_memtable_seqno*/) {
+ assert(vstorage->num_levels() == 1);
+
+ Compaction* c = nullptr;
+ if (mutable_cf_options.ttl > 0) {
+ c = PickTTLCompaction(cf_name, mutable_cf_options, vstorage, log_buffer);
+ }
+ if (c == nullptr) {
+ c = PickSizeCompaction(cf_name, mutable_cf_options, vstorage, log_buffer);
+ }
+ RegisterCompaction(c);
+ return c;
+}
+
+Compaction* FIFOCompactionPicker::CompactRange(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* vstorage, int input_level, int output_level,
+ const CompactRangeOptions& /*compact_range_options*/,
+ const InternalKey* /*begin*/, const InternalKey* /*end*/,
+ InternalKey** compaction_end, bool* /*manual_conflict*/,
+ uint64_t /*max_file_num_to_ignore*/) {
+#ifdef NDEBUG
+ (void)input_level;
+ (void)output_level;
+#endif
+ assert(input_level == 0);
+ assert(output_level == 0);
+ *compaction_end = nullptr;
+ LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, ioptions_.info_log);
+ Compaction* c =
+ PickCompaction(cf_name, mutable_cf_options, vstorage, &log_buffer);
+ log_buffer.FlushBufferToLog();
+ return c;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/compaction/compaction_picker_fifo.h b/src/rocksdb/db/compaction/compaction_picker_fifo.h
new file mode 100644
index 000000000..eb786e5ac
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_fifo.h
@@ -0,0 +1,53 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include "db/compaction/compaction_picker.h"
+
+namespace ROCKSDB_NAMESPACE {
+class FIFOCompactionPicker : public CompactionPicker {
+ public:
+ FIFOCompactionPicker(const ImmutableCFOptions& ioptions,
+ const InternalKeyComparator* icmp)
+ : CompactionPicker(ioptions, icmp) {}
+
+ virtual Compaction* PickCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* version, LogBuffer* log_buffer,
+ SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override;
+
+ virtual Compaction* CompactRange(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* vstorage, int input_level, int output_level,
+ const CompactRangeOptions& compact_range_options,
+ const InternalKey* begin, const InternalKey* end,
+ InternalKey** compaction_end, bool* manual_conflict,
+ uint64_t max_file_num_to_ignore) override;
+
+ // The maximum allowed output level. Always returns 0.
+ virtual int MaxOutputLevel() const override { return 0; }
+
+ virtual bool NeedsCompaction(
+ const VersionStorageInfo* vstorage) const override;
+
+ private:
+ Compaction* PickTTLCompaction(const std::string& cf_name,
+ const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* version,
+ LogBuffer* log_buffer);
+
+ Compaction* PickSizeCompaction(const std::string& cf_name,
+ const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* version,
+ LogBuffer* log_buffer);
+};
+} // namespace ROCKSDB_NAMESPACE
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/compaction/compaction_picker_level.cc b/src/rocksdb/db/compaction/compaction_picker_level.cc
new file mode 100644
index 000000000..012edd080
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_level.cc
@@ -0,0 +1,558 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/compaction/compaction_picker_level.h"
+#include "logging/log_buffer.h"
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool LevelCompactionPicker::NeedsCompaction(
+ const VersionStorageInfo* vstorage) const {
+ if (!vstorage->ExpiredTtlFiles().empty()) {
+ return true;
+ }
+ if (!vstorage->FilesMarkedForPeriodicCompaction().empty()) {
+ return true;
+ }
+ if (!vstorage->BottommostFilesMarkedForCompaction().empty()) {
+ return true;
+ }
+ if (!vstorage->FilesMarkedForCompaction().empty()) {
+ return true;
+ }
+ for (int i = 0; i <= vstorage->MaxInputLevel(); i++) {
+ if (vstorage->CompactionScore(i) >= 1) {
+ return true;
+ }
+ }
+ return false;
+}
+
+namespace {
+// A class to build a leveled compaction step-by-step.
+class LevelCompactionBuilder {
+ public:
+ LevelCompactionBuilder(const std::string& cf_name,
+ VersionStorageInfo* vstorage,
+ SequenceNumber earliest_mem_seqno,
+ CompactionPicker* compaction_picker,
+ LogBuffer* log_buffer,
+ const MutableCFOptions& mutable_cf_options,
+ const ImmutableCFOptions& ioptions)
+ : cf_name_(cf_name),
+ vstorage_(vstorage),
+ earliest_mem_seqno_(earliest_mem_seqno),
+ compaction_picker_(compaction_picker),
+ log_buffer_(log_buffer),
+ mutable_cf_options_(mutable_cf_options),
+ ioptions_(ioptions) {}
+
+ // Pick and return a compaction.
+ Compaction* PickCompaction();
+
+  // Pick the initial files to compact to the next level (or together
+  // in an intra-L0 compaction).
+ void SetupInitialFiles();
+
+ // If the initial files are from L0 level, pick other L0
+ // files if needed.
+ bool SetupOtherL0FilesIfNeeded();
+
+  // Based on the initial files, set up the other files that need to be
+  // compacted in this compaction.
+ bool SetupOtherInputsIfNeeded();
+
+ Compaction* GetCompaction();
+
+  // For the specified level, pick a file that we want to compact.
+ // Returns false if there is no file to compact.
+ // If it returns true, inputs->files.size() will be exactly one.
+ // If level is 0 and there is already a compaction on that level, this
+ // function will return false.
+ bool PickFileToCompact();
+
+ // For L0->L0, picks the longest span of files that aren't currently
+ // undergoing compaction for which work-per-deleted-file decreases. The span
+ // always starts from the newest L0 file.
+ //
+ // Intra-L0 compaction is independent of all other files, so it can be
+ // performed even when L0->base_level compactions are blocked.
+ //
+ // Returns true if `inputs` is populated with a span of files to be compacted;
+ // otherwise, returns false.
+ bool PickIntraL0Compaction();
+
+ void PickExpiredTtlFiles();
+
+ void PickFilesMarkedForPeriodicCompaction();
+
+ const std::string& cf_name_;
+ VersionStorageInfo* vstorage_;
+ SequenceNumber earliest_mem_seqno_;
+ CompactionPicker* compaction_picker_;
+ LogBuffer* log_buffer_;
+ int start_level_ = -1;
+ int output_level_ = -1;
+ int parent_index_ = -1;
+ int base_index_ = -1;
+ double start_level_score_ = 0;
+ bool is_manual_ = false;
+ CompactionInputFiles start_level_inputs_;
+ std::vector<CompactionInputFiles> compaction_inputs_;
+ CompactionInputFiles output_level_inputs_;
+ std::vector<FileMetaData*> grandparents_;
+ CompactionReason compaction_reason_ = CompactionReason::kUnknown;
+
+ const MutableCFOptions& mutable_cf_options_;
+ const ImmutableCFOptions& ioptions_;
+  // Pick a path ID to place a newly generated file, given its level
+ static uint32_t GetPathId(const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options,
+ int level);
+
+ static const int kMinFilesForIntraL0Compaction = 4;
+};
+
+void LevelCompactionBuilder::PickExpiredTtlFiles() {
+ if (vstorage_->ExpiredTtlFiles().empty()) {
+ return;
+ }
+
+ auto continuation = [&](std::pair<int, FileMetaData*> level_file) {
+    // If it's being compacted, there is nothing to do here.
+    // If this assert() fails, it means that some function marked some
+    // files as being_compacted but didn't call ComputeCompactionScore().
+ assert(!level_file.second->being_compacted);
+ start_level_ = level_file.first;
+ output_level_ =
+ (start_level_ == 0) ? vstorage_->base_level() : start_level_ + 1;
+
+ if ((start_level_ == vstorage_->num_non_empty_levels() - 1) ||
+ (start_level_ == 0 &&
+ !compaction_picker_->level0_compactions_in_progress()->empty())) {
+ return false;
+ }
+
+ start_level_inputs_.files = {level_file.second};
+ start_level_inputs_.level = start_level_;
+ return compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+ &start_level_inputs_);
+ };
+
+ for (auto& level_file : vstorage_->ExpiredTtlFiles()) {
+ if (continuation(level_file)) {
+ // found the compaction!
+ return;
+ }
+ }
+
+ start_level_inputs_.files.clear();
+}
+
+void LevelCompactionBuilder::PickFilesMarkedForPeriodicCompaction() {
+ if (vstorage_->FilesMarkedForPeriodicCompaction().empty()) {
+ return;
+ }
+
+ auto continuation = [&](std::pair<int, FileMetaData*> level_file) {
+    // If it's being compacted, there is nothing to do here.
+    // If this assert() fails, it means that some function marked some
+    // files as being_compacted but didn't call ComputeCompactionScore().
+ assert(!level_file.second->being_compacted);
+ output_level_ = start_level_ = level_file.first;
+
+ if (start_level_ == 0 &&
+ !compaction_picker_->level0_compactions_in_progress()->empty()) {
+ return false;
+ }
+
+ start_level_inputs_.files = {level_file.second};
+ start_level_inputs_.level = start_level_;
+ return compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+ &start_level_inputs_);
+ };
+
+ for (auto& level_file : vstorage_->FilesMarkedForPeriodicCompaction()) {
+ if (continuation(level_file)) {
+ // found the compaction!
+ return;
+ }
+ }
+
+ start_level_inputs_.files.clear();
+}
+
+void LevelCompactionBuilder::SetupInitialFiles() {
+ // Find the compactions by size on all levels.
+ bool skipped_l0_to_base = false;
+ for (int i = 0; i < compaction_picker_->NumberLevels() - 1; i++) {
+ start_level_score_ = vstorage_->CompactionScore(i);
+ start_level_ = vstorage_->CompactionScoreLevel(i);
+ assert(i == 0 || start_level_score_ <= vstorage_->CompactionScore(i - 1));
+ if (start_level_score_ >= 1) {
+ if (skipped_l0_to_base && start_level_ == vstorage_->base_level()) {
+ // If L0->base_level compaction is pending, don't schedule further
+ // compaction from base level. Otherwise L0->base_level compaction
+ // may starve.
+ continue;
+ }
+ output_level_ =
+ (start_level_ == 0) ? vstorage_->base_level() : start_level_ + 1;
+ if (PickFileToCompact()) {
+ // found the compaction!
+ if (start_level_ == 0) {
+ // L0 score = `num L0 files` / `level0_file_num_compaction_trigger`
+ compaction_reason_ = CompactionReason::kLevelL0FilesNum;
+ } else {
+ // L1+ score = `Level files size` / `MaxBytesForLevel`
+ compaction_reason_ = CompactionReason::kLevelMaxLevelSize;
+ }
+ break;
+ } else {
+ // didn't find the compaction, clear the inputs
+ start_level_inputs_.clear();
+ if (start_level_ == 0) {
+ skipped_l0_to_base = true;
+ // L0->base_level may be blocked due to ongoing L0->base_level
+ // compactions. It may also be blocked by an ongoing compaction from
+ // base_level downwards.
+ //
+ // In these cases, to reduce L0 file count and thus reduce likelihood
+ // of write stalls, we can attempt compacting a span of files within
+ // L0.
+ if (PickIntraL0Compaction()) {
+ output_level_ = 0;
+ compaction_reason_ = CompactionReason::kLevelL0FilesNum;
+ break;
+ }
+ }
+ }
+ }
+ }
+
+ // if we didn't find a compaction, check if there are any files marked for
+ // compaction
+ if (start_level_inputs_.empty()) {
+ parent_index_ = base_index_ = -1;
+
+ compaction_picker_->PickFilesMarkedForCompaction(
+ cf_name_, vstorage_, &start_level_, &output_level_,
+ &start_level_inputs_);
+ if (!start_level_inputs_.empty()) {
+ is_manual_ = true;
+ compaction_reason_ = CompactionReason::kFilesMarkedForCompaction;
+ return;
+ }
+ }
+
+ // Bottommost Files Compaction on deleting tombstones
+ if (start_level_inputs_.empty()) {
+ size_t i;
+ for (i = 0; i < vstorage_->BottommostFilesMarkedForCompaction().size();
+ ++i) {
+ auto& level_and_file = vstorage_->BottommostFilesMarkedForCompaction()[i];
+ assert(!level_and_file.second->being_compacted);
+ start_level_inputs_.level = output_level_ = start_level_ =
+ level_and_file.first;
+ start_level_inputs_.files = {level_and_file.second};
+ if (compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+ &start_level_inputs_)) {
+ break;
+ }
+ }
+ if (i == vstorage_->BottommostFilesMarkedForCompaction().size()) {
+ start_level_inputs_.clear();
+ } else {
+ assert(!start_level_inputs_.empty());
+ compaction_reason_ = CompactionReason::kBottommostFiles;
+ return;
+ }
+ }
+
+ // TTL Compaction
+ if (start_level_inputs_.empty()) {
+ PickExpiredTtlFiles();
+ if (!start_level_inputs_.empty()) {
+ compaction_reason_ = CompactionReason::kTtl;
+ return;
+ }
+ }
+
+ // Periodic Compaction
+ if (start_level_inputs_.empty()) {
+ PickFilesMarkedForPeriodicCompaction();
+ if (!start_level_inputs_.empty()) {
+ compaction_reason_ = CompactionReason::kPeriodicCompaction;
+ return;
+ }
+ }
+}
+
+bool LevelCompactionBuilder::SetupOtherL0FilesIfNeeded() {
+ if (start_level_ == 0 && output_level_ != 0) {
+ return compaction_picker_->GetOverlappingL0Files(
+ vstorage_, &start_level_inputs_, output_level_, &parent_index_);
+ }
+ return true;
+}
+
+bool LevelCompactionBuilder::SetupOtherInputsIfNeeded() {
+  // Set up input files from the output level. For output to L0, we only
+  // compact spans of files that do not interact with any pending compactions,
+  // so we don't need to consider other levels.
+ if (output_level_ != 0) {
+ output_level_inputs_.level = output_level_;
+ if (!compaction_picker_->SetupOtherInputs(
+ cf_name_, mutable_cf_options_, vstorage_, &start_level_inputs_,
+ &output_level_inputs_, &parent_index_, base_index_)) {
+ return false;
+ }
+
+ compaction_inputs_.push_back(start_level_inputs_);
+ if (!output_level_inputs_.empty()) {
+ compaction_inputs_.push_back(output_level_inputs_);
+ }
+
+    // In some edge cases we could pick a compaction that will be compacting
+    // a key range that overlaps with another running compaction, and both
+    // of them have the same output level. This could happen if
+    // (1) we are running a non-exclusive manual compaction
+    // (2) AddFile ingests a new file into the LSM tree
+    // We need to disallow this from happening.
+ if (compaction_picker_->FilesRangeOverlapWithCompaction(compaction_inputs_,
+ output_level_)) {
+ // This compaction output could potentially conflict with the output
+ // of a currently running compaction, we cannot run it.
+ return false;
+ }
+ compaction_picker_->GetGrandparents(vstorage_, start_level_inputs_,
+ output_level_inputs_, &grandparents_);
+ } else {
+ compaction_inputs_.push_back(start_level_inputs_);
+ }
+ return true;
+}
+
+Compaction* LevelCompactionBuilder::PickCompaction() {
+ // Pick up the first file to start compaction. It may have been extended
+ // to a clean cut.
+ SetupInitialFiles();
+ if (start_level_inputs_.empty()) {
+ return nullptr;
+ }
+ assert(start_level_ >= 0 && output_level_ >= 0);
+
+ // If it is a L0 -> base level compaction, we need to set up other L0
+ // files if needed.
+ if (!SetupOtherL0FilesIfNeeded()) {
+ return nullptr;
+ }
+
+ // Pick files in the output level and expand more files in the start level
+ // if needed.
+ if (!SetupOtherInputsIfNeeded()) {
+ return nullptr;
+ }
+
+ // Form a compaction object containing the files we picked.
+ Compaction* c = GetCompaction();
+
+ TEST_SYNC_POINT_CALLBACK("LevelCompactionPicker::PickCompaction:Return", c);
+
+ return c;
+}
+
+Compaction* LevelCompactionBuilder::GetCompaction() {
+ auto c = new Compaction(
+ vstorage_, ioptions_, mutable_cf_options_, std::move(compaction_inputs_),
+ output_level_,
+ MaxFileSizeForLevel(mutable_cf_options_, output_level_,
+ ioptions_.compaction_style, vstorage_->base_level(),
+ ioptions_.level_compaction_dynamic_level_bytes),
+ mutable_cf_options_.max_compaction_bytes,
+ GetPathId(ioptions_, mutable_cf_options_, output_level_),
+ GetCompressionType(ioptions_, vstorage_, mutable_cf_options_,
+ output_level_, vstorage_->base_level()),
+ GetCompressionOptions(ioptions_, vstorage_, output_level_),
+ /* max_subcompactions */ 0, std::move(grandparents_), is_manual_,
+ start_level_score_, false /* deletion_compaction */, compaction_reason_);
+
+  // If it's a level-0 compaction, make sure we don't execute any other
+  // level-0 compactions in parallel
+ compaction_picker_->RegisterCompaction(c);
+
+  // Creating a compaction influences the compaction score because the score
+  // takes running compactions into account (by skipping files that are already
+  // being compacted). Since we just changed the compaction score, we
+  // recalculate it here.
+ vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_);
+ return c;
+}
+
+/*
+ * Find the optimal path to place a file.
+ * Given a level, find the first path whose cumulative target size can hold
+ * all levels up to and including that level.
+ */
+uint32_t LevelCompactionBuilder::GetPathId(
+ const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options, int level) {
+ uint32_t p = 0;
+ assert(!ioptions.cf_paths.empty());
+
+ // size remaining in the most recent path
+ uint64_t current_path_size = ioptions.cf_paths[0].target_size;
+
+ uint64_t level_size;
+ int cur_level = 0;
+
+ // max_bytes_for_level_base denotes L1 size.
+ // We estimate L0 size to be the same as L1.
+ level_size = mutable_cf_options.max_bytes_for_level_base;
+
+ // Last path is the fallback
+ while (p < ioptions.cf_paths.size() - 1) {
+ if (level_size <= current_path_size) {
+ if (cur_level == level) {
+ // Does desired level fit in this path?
+ return p;
+ } else {
+ current_path_size -= level_size;
+ if (cur_level > 0) {
+ if (ioptions.level_compaction_dynamic_level_bytes) {
+ // Currently, level_compaction_dynamic_level_bytes is ignored when
+ // multiple db paths are specified. https://github.com/facebook/
+ // rocksdb/blob/master/db/column_family.cc.
+ // Still, adding this check to avoid accidentally using
+ // max_bytes_for_level_multiplier_additional
+ level_size = static_cast<uint64_t>(
+ level_size * mutable_cf_options.max_bytes_for_level_multiplier);
+ } else {
+ level_size = static_cast<uint64_t>(
+ level_size * mutable_cf_options.max_bytes_for_level_multiplier *
+ mutable_cf_options.MaxBytesMultiplerAdditional(cur_level));
+ }
+ }
+ cur_level++;
+ continue;
+ }
+ }
+ p++;
+ current_path_size = ioptions.cf_paths[p].target_size;
+ }
+ return p;
+}
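GetPathId() assigns levels to cf_paths in order: it estimates L0 to be L1-sized, grows each subsequent level by the size multiplier, and moves to the next path once the current one cannot hold the next level; the last path is the fallback. A simplified standalone sketch of that arithmetic, ignoring max_bytes_for_level_multiplier_additional (illustrative only, not part of this patch; names are hypothetical):

#include <cassert>
#include <cstdint>
#include <vector>

// Simplified GetPathId(): levels are assigned to paths in order, a path must
// hold every level assigned to it so far, and the last path is the fallback.
static uint32_t PickPathId(const std::vector<uint64_t>& path_target_sizes,
                           uint64_t level1_size, double multiplier, int level) {
  uint32_t p = 0;
  uint64_t remaining = path_target_sizes[0];
  uint64_t level_size = level1_size;  // L0 is estimated to be L1-sized
  int cur_level = 0;
  while (p + 1 < path_target_sizes.size()) {
    if (level_size <= remaining) {
      if (cur_level == level) return p;
      remaining -= level_size;
      if (cur_level > 0) {
        level_size = static_cast<uint64_t>(level_size * multiplier);
      }
      ++cur_level;
      continue;
    }
    ++p;
    remaining = path_target_sizes[p];
  }
  return p;  // last path is the fallback
}

int main() {
  // Paths of 300MB and 10GB, L1 = 100MB, 10x growth: L0 and L1 fit in path 0,
  // while L2 (~1GB) spills over to path 1.
  std::vector<uint64_t> paths = {300ull << 20, 10ull << 30};
  assert(PickPathId(paths, 100ull << 20, 10.0, 1) == 0);
  assert(PickPathId(paths, 100ull << 20, 10.0, 2) == 1);
  return 0;
}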
+
+bool LevelCompactionBuilder::PickFileToCompact() {
+  // Level-0 files are overlapping, so we cannot pick more
+  // than one concurrent compaction at this level. This
+  // could be made better by looking at key-ranges that are
+  // being compacted at level 0.
+ if (start_level_ == 0 &&
+ !compaction_picker_->level0_compactions_in_progress()->empty()) {
+ TEST_SYNC_POINT("LevelCompactionPicker::PickCompactionBySize:0");
+ return false;
+ }
+
+ start_level_inputs_.clear();
+
+ assert(start_level_ >= 0);
+
+ // Pick the largest file in this level that is not already
+ // being compacted
+ const std::vector<int>& file_size =
+ vstorage_->FilesByCompactionPri(start_level_);
+ const std::vector<FileMetaData*>& level_files =
+ vstorage_->LevelFiles(start_level_);
+
+ unsigned int cmp_idx;
+ for (cmp_idx = vstorage_->NextCompactionIndex(start_level_);
+ cmp_idx < file_size.size(); cmp_idx++) {
+ int index = file_size[cmp_idx];
+ auto* f = level_files[index];
+
+    // Do not pick a file to compact if it is being compacted
+    // by a compaction from level n-1.
+ if (f->being_compacted) {
+ continue;
+ }
+
+ start_level_inputs_.files.push_back(f);
+ start_level_inputs_.level = start_level_;
+ if (!compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+ &start_level_inputs_) ||
+ compaction_picker_->FilesRangeOverlapWithCompaction(
+ {start_level_inputs_}, output_level_)) {
+ // A locked (pending compaction) input-level file was pulled in due to
+ // user-key overlap.
+ start_level_inputs_.clear();
+ continue;
+ }
+
+    // Now that the input level is fully expanded, we check whether any output
+    // files are locked due to pending compaction.
+ //
+ // Note we rely on ExpandInputsToCleanCut() to tell us whether any output-
+ // level files are locked, not just the extra ones pulled in for user-key
+ // overlap.
+ InternalKey smallest, largest;
+ compaction_picker_->GetRange(start_level_inputs_, &smallest, &largest);
+ CompactionInputFiles output_level_inputs;
+ output_level_inputs.level = output_level_;
+ vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest,
+ &output_level_inputs.files);
+ if (!output_level_inputs.empty() &&
+ !compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+ &output_level_inputs)) {
+ start_level_inputs_.clear();
+ continue;
+ }
+ base_index_ = index;
+ break;
+ }
+
+ // store where to start the iteration in the next call to PickCompaction
+ vstorage_->SetNextCompactionIndex(start_level_, cmp_idx);
+
+ return start_level_inputs_.size() > 0;
+}
+
+bool LevelCompactionBuilder::PickIntraL0Compaction() {
+ start_level_inputs_.clear();
+ const std::vector<FileMetaData*>& level_files =
+ vstorage_->LevelFiles(0 /* level */);
+ if (level_files.size() <
+ static_cast<size_t>(
+ mutable_cf_options_.level0_file_num_compaction_trigger + 2) ||
+ level_files[0]->being_compacted) {
+    // If L0 isn't accumulating many files beyond the regular trigger, don't
+    // resort to L0->L0 compaction yet.
+ return false;
+ }
+ return FindIntraL0Compaction(level_files, kMinFilesForIntraL0Compaction,
+ port::kMaxUint64,
+ mutable_cf_options_.max_compaction_bytes,
+ &start_level_inputs_, earliest_mem_seqno_);
+}
+} // namespace
+
+Compaction* LevelCompactionPicker::PickCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* vstorage, LogBuffer* log_buffer,
+ SequenceNumber earliest_mem_seqno) {
+ LevelCompactionBuilder builder(cf_name, vstorage, earliest_mem_seqno, this,
+ log_buffer, mutable_cf_options, ioptions_);
+ return builder.PickCompaction();
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_picker_level.h b/src/rocksdb/db/compaction/compaction_picker_level.h
new file mode 100644
index 000000000..b82070e14
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_level.h
@@ -0,0 +1,32 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "db/compaction/compaction_picker.h"
+
+namespace ROCKSDB_NAMESPACE {
+// Picking compactions for leveled compaction. See wiki page
+// https://github.com/facebook/rocksdb/wiki/Leveled-Compaction
+// for description of Leveled compaction.
+class LevelCompactionPicker : public CompactionPicker {
+ public:
+ LevelCompactionPicker(const ImmutableCFOptions& ioptions,
+ const InternalKeyComparator* icmp)
+ : CompactionPicker(ioptions, icmp) {}
+ virtual Compaction* PickCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* vstorage, LogBuffer* log_buffer,
+ SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override;
+
+ virtual bool NeedsCompaction(
+ const VersionStorageInfo* vstorage) const override;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_picker_test.cc b/src/rocksdb/db/compaction/compaction_picker_test.cc
new file mode 100644
index 000000000..278bdb06a
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_test.cc
@@ -0,0 +1,1741 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+
+#include <limits>
+#include <string>
+#include <utility>
+#include "db/compaction/compaction.h"
+#include "db/compaction/compaction_picker_fifo.h"
+#include "db/compaction/compaction_picker_level.h"
+#include "db/compaction/compaction_picker_universal.h"
+
+#include "logging/logging.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class CountingLogger : public Logger {
+ public:
+ using Logger::Logv;
+ void Logv(const char* /*format*/, va_list /*ap*/) override { log_count++; }
+ size_t log_count;
+};
+
+class CompactionPickerTest : public testing::Test {
+ public:
+ const Comparator* ucmp_;
+ InternalKeyComparator icmp_;
+ Options options_;
+ ImmutableCFOptions ioptions_;
+ MutableCFOptions mutable_cf_options_;
+ LevelCompactionPicker level_compaction_picker;
+ std::string cf_name_;
+ CountingLogger logger_;
+ LogBuffer log_buffer_;
+ uint32_t file_num_;
+ CompactionOptionsFIFO fifo_options_;
+ std::unique_ptr<VersionStorageInfo> vstorage_;
+ std::vector<std::unique_ptr<FileMetaData>> files_;
+ // does not own FileMetaData
+ std::unordered_map<uint32_t, std::pair<FileMetaData*, int>> file_map_;
+ // input files to compaction process.
+ std::vector<CompactionInputFiles> input_files_;
+ int compaction_level_start_;
+
+ CompactionPickerTest()
+ : ucmp_(BytewiseComparator()),
+ icmp_(ucmp_),
+ ioptions_(options_),
+ mutable_cf_options_(options_),
+ level_compaction_picker(ioptions_, &icmp_),
+ cf_name_("dummy"),
+ log_buffer_(InfoLogLevel::INFO_LEVEL, &logger_),
+ file_num_(1),
+ vstorage_(nullptr) {
+ mutable_cf_options_.ttl = 0;
+ mutable_cf_options_.periodic_compaction_seconds = 0;
+    // ioptions_.compaction_pri = kMinOverlappingRatio has its own set of
+    // tests to cover it.
+ ioptions_.compaction_pri = kByCompensatedSize;
+ fifo_options_.max_table_files_size = 1;
+ mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+ ioptions_.cf_paths.emplace_back("dummy",
+ std::numeric_limits<uint64_t>::max());
+ }
+
+ ~CompactionPickerTest() override {}
+
+ void NewVersionStorage(int num_levels, CompactionStyle style) {
+ DeleteVersionStorage();
+ options_.num_levels = num_levels;
+ vstorage_.reset(new VersionStorageInfo(&icmp_, ucmp_, options_.num_levels,
+ style, nullptr, false));
+ vstorage_->CalculateBaseBytes(ioptions_, mutable_cf_options_);
+ }
+
+ void DeleteVersionStorage() {
+ vstorage_.reset();
+ files_.clear();
+ file_map_.clear();
+ input_files_.clear();
+ }
+
+ void Add(int level, uint32_t file_number, const char* smallest,
+ const char* largest, uint64_t file_size = 1, uint32_t path_id = 0,
+ SequenceNumber smallest_seq = 100, SequenceNumber largest_seq = 100,
+ size_t compensated_file_size = 0) {
+ assert(level < vstorage_->num_levels());
+ FileMetaData* f = new FileMetaData(
+ file_number, path_id, file_size,
+ InternalKey(smallest, smallest_seq, kTypeValue),
+ InternalKey(largest, largest_seq, kTypeValue), smallest_seq,
+ largest_seq, /* marked_for_compact */ false, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName);
+ f->compensated_file_size =
+ (compensated_file_size != 0) ? compensated_file_size : file_size;
+ vstorage_->AddFile(level, f);
+ files_.emplace_back(f);
+ file_map_.insert({file_number, {f, level}});
+ }
+
+ void SetCompactionInputFilesLevels(int level_count, int start_level) {
+ input_files_.resize(level_count);
+ for (int i = 0; i < level_count; ++i) {
+ input_files_[i].level = start_level + i;
+ }
+ compaction_level_start_ = start_level;
+ }
+
+ void AddToCompactionFiles(uint32_t file_number) {
+ auto iter = file_map_.find(file_number);
+ assert(iter != file_map_.end());
+ int level = iter->second.second;
+ assert(level < vstorage_->num_levels());
+ input_files_[level - compaction_level_start_].files.emplace_back(
+ iter->second.first);
+ }
+
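+  // Recomputes the derived per-version metadata (base bytes, compaction
+  // scores, file ordering, level briefs) so the pickers see fully derived
+  // state, similar to what a real version install would produce.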
+ void UpdateVersionStorageInfo() {
+ vstorage_->CalculateBaseBytes(ioptions_, mutable_cf_options_);
+ vstorage_->UpdateFilesByCompactionPri(ioptions_.compaction_pri);
+ vstorage_->UpdateNumNonEmptyLevels();
+ vstorage_->GenerateFileIndexer();
+ vstorage_->GenerateLevelFilesBrief();
+ vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_);
+ vstorage_->GenerateLevel0NonOverlapping();
+ vstorage_->ComputeFilesMarkedForCompaction();
+ vstorage_->SetFinalized();
+ }
+};
+
+TEST_F(CompactionPickerTest, Empty) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ UpdateVersionStorageInfo();
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() == nullptr);
+}
+
+TEST_F(CompactionPickerTest, Single) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ Add(0, 1U, "p", "q");
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() == nullptr);
+}
+
+TEST_F(CompactionPickerTest, Level0Trigger) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ Add(0, 1U, "150", "200");
+ Add(0, 2U, "200", "250");
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, Level1Trigger) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(1, 66U, "150", "200", 1000000000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(66U, compaction->input(0, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, Level1Trigger2) {
+ mutable_cf_options_.target_file_size_base = 10000000000;
+ mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(1, 66U, "150", "200", 1000000001U);
+ Add(1, 88U, "201", "300", 1000000000U);
+ Add(2, 6U, "150", "179", 1000000000U);
+ Add(2, 7U, "180", "220", 1000000000U);
+ Add(2, 8U, "221", "300", 1000000000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(2U, compaction->num_input_files(1));
+ ASSERT_EQ(66U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(6U, compaction->input(1, 0)->fd.GetNumber());
+ ASSERT_EQ(7U, compaction->input(1, 1)->fd.GetNumber());
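+  // The inputs total roughly 3GB and the target file size is far larger; the
+  // expected 1073741824 (1GB) corresponds to the cap that
+  // OutputFilePreallocationSize() places on the preallocation.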
+ ASSERT_EQ(uint64_t{1073741824}, compaction->OutputFilePreallocationSize());
+}
+
+TEST_F(CompactionPickerTest, LevelMaxScore) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.target_file_size_base = 10000000;
+ mutable_cf_options_.max_bytes_for_level_base = 10 * 1024 * 1024;
+ mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+ Add(0, 1U, "150", "200", 1000000U);
+ // Level 1 score 1.2
+ Add(1, 66U, "150", "200", 6000000U);
+ Add(1, 88U, "201", "300", 6000000U);
+ // Level 2 score 1.8. File 7 is the largest. Should be picked
+ Add(2, 6U, "150", "179", 60000000U);
+ Add(2, 7U, "180", "220", 60000001U);
+ Add(2, 8U, "221", "300", 60000000U);
+ // Level 3 score slightly larger than 1
+ Add(3, 26U, "150", "170", 260000000U);
+ Add(3, 27U, "171", "179", 260000000U);
+ Add(3, 28U, "191", "220", 260000000U);
+ Add(3, 29U, "221", "300", 260000000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber());
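+  // File 7 is larger than target_file_size_base, so the preallocation falls
+  // back to the target file size plus a 10% cushion.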
+ ASSERT_EQ(mutable_cf_options_.target_file_size_base +
+ mutable_cf_options_.target_file_size_base / 10,
+ compaction->OutputFilePreallocationSize());
+}
+
+TEST_F(CompactionPickerTest, NeedsCompactionLevel) {
+ const int kLevels = 6;
+ const int kFileCount = 20;
+
+ for (int level = 0; level < kLevels - 1; ++level) {
+ NewVersionStorage(kLevels, kCompactionStyleLevel);
+ uint64_t file_size = vstorage_->MaxBytesForLevel(level) * 2 / kFileCount;
+ for (int file_count = 1; file_count <= kFileCount; ++file_count) {
+ // start a brand new version in each test.
+ NewVersionStorage(kLevels, kCompactionStyleLevel);
+ for (int i = 0; i < file_count; ++i) {
+ Add(level, i, ToString((i + 100) * 1000).c_str(),
+ ToString((i + 100) * 1000 + 999).c_str(),
+ file_size, 0, i * 100, i * 100 + 99);
+ }
+ UpdateVersionStorageInfo();
+ ASSERT_EQ(vstorage_->CompactionScoreLevel(0), level);
+ ASSERT_EQ(level_compaction_picker.NeedsCompaction(vstorage_.get()),
+ vstorage_->CompactionScore(0) >= 1);
+ // release the version storage
+ DeleteVersionStorage();
+ }
+ }
+}
+
+TEST_F(CompactionPickerTest, Level0TriggerDynamic) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 200;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200");
+ Add(0, 2U, "200", "250");
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(1, static_cast<int>(compaction->num_input_levels()));
+ ASSERT_EQ(num_levels - 1, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, Level0TriggerDynamic2) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 200;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200");
+ Add(0, 2U, "200", "250");
+ Add(num_levels - 1, 3U, "200", "250", 300U);
+
+ UpdateVersionStorageInfo();
+ ASSERT_EQ(vstorage_->base_level(), num_levels - 2);
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(1, static_cast<int>(compaction->num_input_levels()));
+ ASSERT_EQ(num_levels - 2, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, Level0TriggerDynamic3) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 200;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200");
+ Add(0, 2U, "200", "250");
+ Add(num_levels - 1, 3U, "200", "250", 300U);
+ Add(num_levels - 1, 4U, "300", "350", 3000U);
+
+ UpdateVersionStorageInfo();
+ ASSERT_EQ(vstorage_->base_level(), num_levels - 3);
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(1, static_cast<int>(compaction->num_input_levels()));
+ ASSERT_EQ(num_levels - 3, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, Level0TriggerDynamic4) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 200;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200");
+ Add(0, 2U, "200", "250");
+ Add(num_levels - 1, 3U, "200", "250", 300U);
+ Add(num_levels - 1, 4U, "300", "350", 3000U);
+ Add(num_levels - 3, 5U, "150", "180", 3U);
+ Add(num_levels - 3, 6U, "181", "300", 3U);
+ Add(num_levels - 3, 7U, "400", "450", 3U);
+
+ UpdateVersionStorageInfo();
+ ASSERT_EQ(vstorage_->base_level(), num_levels - 3);
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->num_input_files(1));
+ ASSERT_EQ(num_levels - 3, compaction->level(1));
+ ASSERT_EQ(5U, compaction->input(1, 0)->fd.GetNumber());
+ ASSERT_EQ(6U, compaction->input(1, 1)->fd.GetNumber());
+ ASSERT_EQ(2, static_cast<int>(compaction->num_input_levels()));
+ ASSERT_EQ(num_levels - 3, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, LevelTriggerDynamic4) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 200;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200");
+ Add(num_levels - 1, 3U, "200", "250", 300U);
+ Add(num_levels - 1, 4U, "300", "350", 3000U);
+ Add(num_levels - 1, 4U, "400", "450", 3U);
+ Add(num_levels - 2, 5U, "150", "180", 300U);
+ Add(num_levels - 2, 6U, "181", "350", 500U);
+ Add(num_levels - 2, 7U, "400", "450", 200U);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(5U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(0, compaction->num_input_files(1));
+ ASSERT_EQ(1U, compaction->num_input_levels());
+ ASSERT_EQ(num_levels - 1, compaction->output_level());
+}
+
+// Universal and FIFO Compactions are not supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
+TEST_F(CompactionPickerTest, NeedsCompactionUniversal) {
+ NewVersionStorage(1, kCompactionStyleUniversal);
+ UniversalCompactionPicker universal_compaction_picker(
+ ioptions_, &icmp_);
+ UpdateVersionStorageInfo();
+  // Must return false when there are no files.
+ ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()),
+ false);
+
+  // Verify the trigger given different numbers of L0 files.
+ for (int i = 1;
+ i <= mutable_cf_options_.level0_file_num_compaction_trigger * 2; ++i) {
+ NewVersionStorage(1, kCompactionStyleUniversal);
+ Add(0, i, ToString((i + 100) * 1000).c_str(),
+ ToString((i + 100) * 1000 + 999).c_str(), 1000000, 0, i * 100,
+ i * 100 + 99);
+ UpdateVersionStorageInfo();
+    ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()),
+              vstorage_->CompactionScore(0) >= 1);
+ }
+}
+
+TEST_F(CompactionPickerTest, CompactionUniversalIngestBehindReservedLevel) {
+ const uint64_t kFileSize = 100000;
+ NewVersionStorage(1, kCompactionStyleUniversal);
+ ioptions_.allow_ingest_behind = true;
+ ioptions_.num_levels = 3;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+ UpdateVersionStorageInfo();
+  // Must return false when there are no files.
+ ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()),
+ false);
+
+ NewVersionStorage(3, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(0, 2U, "201", "250", kFileSize, 0, 401, 450);
+ Add(0, 4U, "260", "300", kFileSize, 0, 260, 300);
+ Add(1, 5U, "100", "151", kFileSize, 0, 200, 251);
+ Add(1, 3U, "301", "350", kFileSize, 0, 101, 150);
+ Add(2, 6U, "120", "200", kFileSize, 0, 20, 100);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+
+ // output level should be the one above the bottom-most
+ ASSERT_EQ(1, compaction->output_level());
+}
+
+// Tests whether files can be trivially moved in multi-level
+// universal compaction when the allow_trivial_move option is set.
+// In this test the input files overlap, so they cannot
+// be trivially moved.
+TEST_F(CompactionPickerTest, CannotTrivialMoveUniversal) {
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.compaction_options_universal.allow_trivial_move = true;
+ NewVersionStorage(1, kCompactionStyleUniversal);
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+ UpdateVersionStorageInfo();
+  // Must return false when there are no files.
+ ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()),
+ false);
+
+ NewVersionStorage(3, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(0, 2U, "201", "250", kFileSize, 0, 401, 450);
+ Add(0, 4U, "260", "300", kFileSize, 0, 260, 300);
+ Add(1, 5U, "100", "151", kFileSize, 0, 200, 251);
+ Add(1, 3U, "301", "350", kFileSize, 0, 101, 150);
+ Add(2, 6U, "120", "200", kFileSize, 0, 20, 100);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+
+ ASSERT_TRUE(!compaction->is_trivial_move());
+}
+
+// Tests whether files can be trivially moved in multi-level
+// universal compaction when the allow_trivial_move option is set.
+// In this test the input files don't overlap, so they should
+// be trivially moved.
+TEST_F(CompactionPickerTest, AllowsTrivialMoveUniversal) {
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.compaction_options_universal.allow_trivial_move = true;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(3, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(0, 2U, "201", "250", kFileSize, 0, 401, 450);
+ Add(0, 4U, "260", "300", kFileSize, 0, 260, 300);
+ Add(1, 5U, "010", "080", kFileSize, 0, 200, 251);
+ Add(2, 3U, "301", "350", kFileSize, 0, 101, 150);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+
+ ASSERT_TRUE(compaction->is_trivial_move());
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction1) {
+ // The case where universal periodic compaction can be picked
+ // with some newer files being compacted.
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.periodic_compaction_seconds = 1000;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(0, 2U, "201", "250", kFileSize, 0, 401, 450);
+ Add(0, 4U, "260", "300", kFileSize, 0, 260, 300);
+ Add(3, 5U, "010", "080", kFileSize, 0, 200, 251);
+ Add(4, 3U, "301", "350", kFileSize, 0, 101, 150);
+ Add(4, 6U, "501", "750", kFileSize, 0, 101, 150);
+
+ file_map_[2].first->being_compacted = true;
+ UpdateVersionStorageInfo();
+ vstorage_->TEST_AddFileMarkedForPeriodicCompaction(4, file_map_[3].first);
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+
+ ASSERT_TRUE(compaction);
+ ASSERT_EQ(4, compaction->output_level());
+ ASSERT_EQ(0, compaction->start_level());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction2) {
+  // The case where universal periodic compaction does not pick up the only
+  // level available to compact when it doesn't cover any file marked for
+  // periodic compaction.
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.periodic_compaction_seconds = 1000;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(3, 5U, "010", "080", kFileSize, 0, 200, 251);
+ Add(4, 3U, "301", "350", kFileSize, 0, 101, 150);
+ Add(4, 6U, "501", "750", kFileSize, 0, 101, 150);
+
+ file_map_[5].first->being_compacted = true;
+ UpdateVersionStorageInfo();
+ vstorage_->TEST_AddFileMarkedForPeriodicCompaction(0, file_map_[1].first);
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+
+ ASSERT_FALSE(compaction);
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction3) {
+  // The case where universal periodic compaction does not pick up the last
+  // sorted run (an L0 file) on its own when it isn't marked for periodic
+  // compaction.
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.periodic_compaction_seconds = 1000;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(0, 5U, "010", "080", kFileSize, 0, 200, 251);
+ Add(0, 6U, "501", "750", kFileSize, 0, 101, 150);
+
+ file_map_[5].first->being_compacted = true;
+ UpdateVersionStorageInfo();
+ vstorage_->TEST_AddFileMarkedForPeriodicCompaction(0, file_map_[1].first);
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+
+ ASSERT_FALSE(compaction);
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction4) {
+  // The case where universal periodic compaction couldn't form a compaction
+  // that includes any file marked for periodic compaction. Right now we form
+  // the compaction anyway if there is more than one sorted run. The case is
+  // here just to validate that it doesn't crash.
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.periodic_compaction_seconds = 1000;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(2, 2U, "010", "080", kFileSize, 0, 200, 251);
+ Add(3, 5U, "010", "080", kFileSize, 0, 200, 251);
+ Add(4, 3U, "301", "350", kFileSize, 0, 101, 150);
+ Add(4, 6U, "501", "750", kFileSize, 0, 101, 150);
+
+ file_map_[2].first->being_compacted = true;
+ UpdateVersionStorageInfo();
+ vstorage_->TEST_AddFileMarkedForPeriodicCompaction(0, file_map_[2].first);
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(!compaction ||
+ compaction->start_level() != compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction5) {
+ // Test single L0 file periodic compaction triggering.
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.periodic_compaction_seconds = 1000;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(0, 6U, "150", "200", kFileSize, 0, 500, 550);
+ UpdateVersionStorageInfo();
+ vstorage_->TEST_AddFileMarkedForPeriodicCompaction(0, file_map_[6].first);
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction);
+ ASSERT_EQ(0, compaction->start_level());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(6U, compaction->input(0, 0)->fd.GetNumber());
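+  // With five levels, the single-file periodic compaction targets the
+  // bottommost level (4).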
+ ASSERT_EQ(4, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction6) {
+ // Test single sorted run non-L0 periodic compaction
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.periodic_compaction_seconds = 1000;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(4, 5U, "150", "200", kFileSize, 0, 500, 550);
+ Add(4, 6U, "350", "400", kFileSize, 0, 500, 550);
+ UpdateVersionStorageInfo();
+ vstorage_->TEST_AddFileMarkedForPeriodicCompaction(4, file_map_[6].first);
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction);
+ ASSERT_EQ(4, compaction->start_level());
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(5U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(6U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(4, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, NeedsCompactionFIFO) {
+ NewVersionStorage(1, kCompactionStyleFIFO);
+ const int kFileCount =
+ mutable_cf_options_.level0_file_num_compaction_trigger * 3;
+ const uint64_t kFileSize = 100000;
+ const uint64_t kMaxSize = kFileSize * kFileCount / 2;
+
+ fifo_options_.max_table_files_size = kMaxSize;
+ mutable_cf_options_.compaction_options_fifo = fifo_options_;
+ FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+ UpdateVersionStorageInfo();
+  // Must return false when there are no files.
+ ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), false);
+
+ // verify whether compaction is needed based on the current
+ // size of L0 files.
+ uint64_t current_size = 0;
+ for (int i = 1; i <= kFileCount; ++i) {
+ NewVersionStorage(1, kCompactionStyleFIFO);
+ Add(0, i, ToString((i + 100) * 1000).c_str(),
+ ToString((i + 100) * 1000 + 999).c_str(),
+ kFileSize, 0, i * 100, i * 100 + 99);
+ current_size += kFileSize;
+ UpdateVersionStorageInfo();
+ ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()),
+ vstorage_->CompactionScore(0) >= 1);
+ }
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(CompactionPickerTest, CompactionPriMinOverlapping1) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ mutable_cf_options_.target_file_size_base = 100000000000;
+ mutable_cf_options_.target_file_size_multiplier = 10;
+ mutable_cf_options_.max_bytes_for_level_base = 10 * 1024 * 1024;
+ mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+
+ Add(2, 6U, "150", "179", 50000000U);
+ Add(2, 7U, "180", "220", 50000000U);
+ Add(2, 8U, "321", "400", 50000000U); // File not overlapping
+ Add(2, 9U, "721", "800", 50000000U);
+
+ Add(3, 26U, "150", "170", 260000000U);
+ Add(3, 27U, "171", "179", 260000000U);
+ Add(3, 28U, "191", "220", 260000000U);
+ Add(3, 29U, "221", "300", 260000000U);
+ Add(3, 30U, "750", "900", 260000000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ // Pick file 8 because it overlaps with 0 files on level 3.
+ ASSERT_EQ(8U, compaction->input(0, 0)->fd.GetNumber());
+ // Compaction input size * 1.1
+ ASSERT_GE(uint64_t{55000000}, compaction->OutputFilePreallocationSize());
+}
+
+TEST_F(CompactionPickerTest, CompactionPriMinOverlapping2) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ mutable_cf_options_.target_file_size_base = 10000000;
+ mutable_cf_options_.target_file_size_multiplier = 10;
+ mutable_cf_options_.max_bytes_for_level_base = 10 * 1024 * 1024;
+
+ Add(2, 6U, "150", "175",
+ 60000000U); // Overlaps with file 26, 27, total size 521M
+ Add(2, 7U, "176", "200", 60000000U); // Overlaps with file 27, 28, total size
+                                        // 520M, the smallest overlapping
+ Add(2, 8U, "201", "300",
+ 60000000U); // Overlaps with file 28, 29, total size 521M
+
+ Add(3, 26U, "100", "110", 261000000U);
+ Add(3, 26U, "150", "170", 261000000U);
+ Add(3, 27U, "171", "179", 260000000U);
+ Add(3, 28U, "191", "220", 260000000U);
+ Add(3, 29U, "221", "300", 261000000U);
+ Add(3, 30U, "321", "400", 261000000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+  // Picking file 7 because its overlapping ratio is the smallest.
+ ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, CompactionPriMinOverlapping3) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ mutable_cf_options_.max_bytes_for_level_base = 10000000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+
+  // Files 7 and 8 each overlap a single file in the output level. File 8 is
+  // larger, so its overlapping ratio is smaller and it will be picked.
+ Add(2, 6U, "150", "167", 60000000U); // Overlaps with file 26, 27
+ Add(2, 7U, "168", "169", 60000000U); // Overlaps with file 27
+ Add(2, 8U, "201", "300", 61000000U); // Overlaps with file 28, but the file
+ // itself is larger. Should be picked.
+
+ Add(3, 26U, "160", "165", 260000000U);
+ Add(3, 27U, "166", "170", 260000000U);
+ Add(3, 28U, "180", "400", 260000000U);
+ Add(3, 29U, "401", "500", 260000000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+  // Picking file 8 because its overlapping ratio is the smallest.
+ ASSERT_EQ(8U, compaction->input(0, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, CompactionPriMinOverlapping4) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ mutable_cf_options_.max_bytes_for_level_base = 10000000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+
+  // File 6 overlaps with files 26 and 27, but its compensated file size
+  // lowers its overlapping ratio, so it will be picked.
+ Add(2, 6U, "150", "167", 60000000U, 0, 100, 100, 180000000U);
+ Add(2, 7U, "168", "169", 60000000U); // Overlaps with file 27
+ Add(2, 8U, "201", "300", 61000000U); // Overlaps with file 28
+
+ Add(3, 26U, "160", "165", 60000000U);
+ // Boosted file size in output level is not considered.
+ Add(3, 27U, "166", "170", 60000000U, 0, 100, 100, 260000000U);
+ Add(3, 28U, "180", "400", 60000000U);
+ Add(3, 29U, "401", "500", 60000000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+  // Picking file 6 because its overlapping ratio, computed with the
+  // compensated file size, is the smallest.
+ ASSERT_EQ(6U, compaction->input(0, 0)->fd.GetNumber());
+}
+
+// This test exhibits the bug where we don't properly reset parent_index in
+// PickCompaction()
+TEST_F(CompactionPickerTest, ParentIndexResetBug) {
+ int num_levels = ioptions_.num_levels;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 200;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200"); // <- marked for compaction
+ Add(1, 3U, "400", "500", 600); // <- this one needs compacting
+ Add(2, 4U, "150", "200");
+ Add(2, 5U, "201", "210");
+ Add(2, 6U, "300", "310");
+ Add(2, 7U, "400", "500"); // <- being compacted
+
+ vstorage_->LevelFiles(2)[3]->being_compacted = true;
+ vstorage_->LevelFiles(0)[0]->marked_for_compaction = true;
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+}
+
+// This test checks ExpandWhileOverlapping() by having overlapping user keys
+// ranges (with different sequence numbers) in the input files.
+TEST_F(CompactionPickerTest, OverlappingUserKeys) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ ioptions_.compaction_pri = kByCompensatedSize;
+
+ Add(1, 1U, "100", "150", 1U);
+ // Overlapping user keys
+ Add(1, 2U, "200", "400", 1U);
+ Add(1, 3U, "400", "500", 1000000000U, 0, 0);
+ Add(2, 4U, "600", "700", 1U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_levels());
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(3U, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys2) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ // Overlapping user keys on same level and output level
+ Add(1, 1U, "200", "400", 1000000000U);
+ Add(1, 2U, "400", "500", 1U, 0, 0);
+ Add(2, 3U, "000", "100", 1U);
+ Add(2, 4U, "100", "600", 1U, 0, 0);
+ Add(2, 5U, "600", "700", 1U, 0, 0);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(3U, compaction->num_input_files(1));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(3U, compaction->input(1, 0)->fd.GetNumber());
+ ASSERT_EQ(4U, compaction->input(1, 1)->fd.GetNumber());
+ ASSERT_EQ(5U, compaction->input(1, 2)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys3) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ // Chain of overlapping user key ranges (forces ExpandWhileOverlapping() to
+ // expand multiple times)
+ Add(1, 1U, "100", "150", 1U);
+ Add(1, 2U, "150", "200", 1U, 0, 0);
+ Add(1, 3U, "200", "250", 1000000000U, 0, 0);
+ Add(1, 4U, "250", "300", 1U, 0, 0);
+ Add(1, 5U, "300", "350", 1U, 0, 0);
+ // Output level overlaps with the beginning and the end of the chain
+ Add(2, 6U, "050", "100", 1U);
+ Add(2, 7U, "350", "400", 1U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(5U, compaction->num_input_files(0));
+ ASSERT_EQ(2U, compaction->num_input_files(1));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(3U, compaction->input(0, 2)->fd.GetNumber());
+ ASSERT_EQ(4U, compaction->input(0, 3)->fd.GetNumber());
+ ASSERT_EQ(5U, compaction->input(0, 4)->fd.GetNumber());
+ ASSERT_EQ(6U, compaction->input(1, 0)->fd.GetNumber());
+ ASSERT_EQ(7U, compaction->input(1, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys4) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.max_bytes_for_level_base = 1000000;
+
+ Add(1, 1U, "100", "150", 1U);
+ Add(1, 2U, "150", "199", 1U, 0, 0);
+ Add(1, 3U, "200", "250", 1100000U, 0, 0);
+ Add(1, 4U, "251", "300", 1U, 0, 0);
+ Add(1, 5U, "300", "350", 1U, 0, 0);
+
+ Add(2, 6U, "100", "115", 1U);
+ Add(2, 7U, "125", "325", 1U);
+ Add(2, 8U, "350", "400", 1U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->num_input_files(1));
+ ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(7U, compaction->input(1, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys5) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ // Overlapping user keys on same level and output level
+ Add(1, 1U, "200", "400", 1000000000U);
+ Add(1, 2U, "400", "500", 1U, 0, 0);
+ Add(2, 3U, "000", "100", 1U);
+ Add(2, 4U, "100", "600", 1U, 0, 0);
+ Add(2, 5U, "600", "700", 1U, 0, 0);
+
+ vstorage_->LevelFiles(2)[2]->being_compacted = true;
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() == nullptr);
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys6) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ // Overlapping user keys on same level and output level
+ Add(1, 1U, "200", "400", 1U, 0, 0);
+ Add(1, 2U, "401", "500", 1U, 0, 0);
+ Add(2, 3U, "000", "100", 1U);
+ Add(2, 4U, "100", "300", 1U, 0, 0);
+ Add(2, 5U, "305", "450", 1U, 0, 0);
+ Add(2, 6U, "460", "600", 1U, 0, 0);
+ Add(2, 7U, "600", "700", 1U, 0, 0);
+
+ vstorage_->LevelFiles(1)[0]->marked_for_compaction = true;
+ vstorage_->LevelFiles(1)[1]->marked_for_compaction = true;
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(3U, compaction->num_input_files(1));
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys7) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.max_compaction_bytes = 100000000000u;
+ // Overlapping user keys on same level and output level
+ Add(1, 1U, "200", "400", 1U, 0, 0);
+ Add(1, 2U, "401", "500", 1000000000U, 0, 0);
+ Add(2, 3U, "100", "250", 1U);
+ Add(2, 4U, "300", "600", 1U, 0, 0);
+ Add(2, 5U, "600", "800", 1U, 0, 0);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_GE(1U, compaction->num_input_files(0));
+ ASSERT_GE(2U, compaction->num_input_files(1));
+ // File 5 has to be included in the compaction
+ ASSERT_EQ(5U, compaction->inputs(1)->back()->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys8) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.max_compaction_bytes = 100000000000u;
+ // grow the number of inputs in "level" without
+ // changing the number of "level+1" files we pick up
+ // Expand input level as much as possible
+ // no overlapping case
+ Add(1, 1U, "101", "150", 1U);
+ Add(1, 2U, "151", "200", 1U);
+ Add(1, 3U, "201", "300", 1000000000U);
+ Add(1, 4U, "301", "400", 1U);
+ Add(1, 5U, "401", "500", 1U);
+ Add(2, 6U, "150", "200", 1U);
+ Add(2, 7U, "200", "450", 1U, 0, 0);
+ Add(2, 8U, "500", "600", 1U);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(3U, compaction->num_input_files(0));
+ ASSERT_EQ(2U, compaction->num_input_files(1));
+ ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(3U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(4U, compaction->input(0, 2)->fd.GetNumber());
+ ASSERT_EQ(6U, compaction->input(1, 0)->fd.GetNumber());
+ ASSERT_EQ(7U, compaction->input(1, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys9) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.max_compaction_bytes = 100000000000u;
+ // grow the number of inputs in "level" without
+ // changing the number of "level+1" files we pick up
+ // Expand input level as much as possible
+ // overlapping case
+ Add(1, 1U, "121", "150", 1U);
+ Add(1, 2U, "151", "200", 1U);
+ Add(1, 3U, "201", "300", 1000000000U);
+ Add(1, 4U, "301", "400", 1U);
+ Add(1, 5U, "401", "500", 1U);
+ Add(2, 6U, "100", "120", 1U);
+ Add(2, 7U, "150", "200", 1U);
+ Add(2, 8U, "200", "450", 1U, 0, 0);
+ Add(2, 9U, "501", "600", 1U);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(5U, compaction->num_input_files(0));
+ ASSERT_EQ(2U, compaction->num_input_files(1));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(3U, compaction->input(0, 2)->fd.GetNumber());
+ ASSERT_EQ(4U, compaction->input(0, 3)->fd.GetNumber());
+ ASSERT_EQ(7U, compaction->input(1, 0)->fd.GetNumber());
+ ASSERT_EQ(8U, compaction->input(1, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys10) {
+ // Locked file encountered when pulling in extra input-level files with same
+ // user keys. Verify we pick the next-best file from the same input level.
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.max_compaction_bytes = 100000000000u;
+
+ // file_number 2U is largest and thus first choice. But it overlaps with
+ // file_number 1U which is being compacted. So instead we pick the next-
+ // biggest file, 3U, which is eligible for compaction.
+ Add(1 /* level */, 1U /* file_number */, "100" /* smallest */,
+ "150" /* largest */, 1U /* file_size */);
+ file_map_[1U].first->being_compacted = true;
+ Add(1 /* level */, 2U /* file_number */, "150" /* smallest */,
+ "200" /* largest */, 1000000000U /* file_size */, 0 /* smallest_seq */,
+ 0 /* largest_seq */);
+ Add(1 /* level */, 3U /* file_number */, "201" /* smallest */,
+ "250" /* largest */, 900000000U /* file_size */);
+ Add(2 /* level */, 4U /* file_number */, "100" /* smallest */,
+ "150" /* largest */, 1U /* file_size */);
+ Add(2 /* level */, 5U /* file_number */, "151" /* smallest */,
+ "200" /* largest */, 1U /* file_size */);
+ Add(2 /* level */, 6U /* file_number */, "201" /* smallest */,
+ "250" /* largest */, 1U /* file_size */);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->num_input_files(1));
+ ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(6U, compaction->input(1, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys11) {
+ // Locked file encountered when pulling in extra output-level files with same
+ // user keys. Expected to skip that compaction and pick the next-best choice.
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.max_compaction_bytes = 100000000000u;
+
+ // score(L1) = 3.7
+ // score(L2) = 1.85
+ // There is no eligible file in L1 to compact since both candidates pull in
+ // file_number 5U, which overlaps with a file pending compaction (6U). The
+ // first eligible compaction is from L2->L3.
+ Add(1 /* level */, 2U /* file_number */, "151" /* smallest */,
+ "200" /* largest */, 1000000000U /* file_size */);
+ Add(1 /* level */, 3U /* file_number */, "201" /* smallest */,
+ "250" /* largest */, 1U /* file_size */);
+ Add(2 /* level */, 4U /* file_number */, "100" /* smallest */,
+ "149" /* largest */, 5000000000U /* file_size */);
+ Add(2 /* level */, 5U /* file_number */, "150" /* smallest */,
+ "201" /* largest */, 1U /* file_size */);
+ Add(2 /* level */, 6U /* file_number */, "201" /* smallest */,
+ "249" /* largest */, 1U /* file_size */, 0 /* smallest_seq */,
+ 0 /* largest_seq */);
+ file_map_[6U].first->being_compacted = true;
+ Add(3 /* level */, 7U /* file_number */, "100" /* smallest */,
+ "149" /* largest */, 1U /* file_size */);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->num_input_files(1));
+ ASSERT_EQ(4U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(7U, compaction->input(1, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri1) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 900000000U;
+
+ // 6 L0 files, score 3.
+ Add(0, 1U, "000", "400", 1U);
+ Add(0, 2U, "001", "400", 1U, 0, 0);
+ Add(0, 3U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 31U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 32U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 33U, "001", "400", 1000000000U, 0, 0);
+
+  // L1 total size 2GB, score 2.2. If one file is being compacted, score 1.1.
+ Add(1, 4U, "050", "300", 1000000000U, 0, 0);
+ file_map_[4u].first->being_compacted = true;
+ Add(1, 5U, "301", "350", 1000000000U, 0, 0);
+
+ // Output level overlaps with the beginning and the end of the chain
+ Add(2, 6U, "050", "100", 1U);
+ Add(2, 7U, "300", "400", 1U);
+
+  // No compaction should be scheduled: L0 has higher priority than L1, but
+  // the L0->L1 compaction is blocked by a file in L1 being compacted.
+ UpdateVersionStorageInfo();
+ ASSERT_EQ(0, vstorage_->CompactionScoreLevel(0));
+ ASSERT_EQ(1, vstorage_->CompactionScoreLevel(1));
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() == nullptr);
+}
+
+TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri2) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 900000000U;
+
+ // 6 L0 files, score 3.
+ Add(0, 1U, "000", "400", 1U);
+ Add(0, 2U, "001", "400", 1U, 0, 0);
+ Add(0, 3U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 31U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 32U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 33U, "001", "400", 1000000000U, 0, 0);
+
+  // L1 total size 2GB, score 2.2. If one file is being compacted, score 1.1.
+ Add(1, 4U, "050", "300", 1000000000U, 0, 0);
+ Add(1, 5U, "301", "350", 1000000000U, 0, 0);
+
+ // Output level overlaps with the beginning and the end of the chain
+ Add(2, 6U, "050", "100", 1U);
+ Add(2, 7U, "300", "400", 1U);
+
+ // If no file in L1 being compacted, L0->L1 compaction will be scheduled.
+ UpdateVersionStorageInfo(); // being_compacted flag is cleared here.
+ ASSERT_EQ(0, vstorage_->CompactionScoreLevel(0));
+ ASSERT_EQ(1, vstorage_->CompactionScoreLevel(1));
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+}
+
+TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri3) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 900000000U;
+
+ // 6 L0 files, score 3.
+ Add(0, 1U, "000", "400", 1U);
+ Add(0, 2U, "001", "400", 1U, 0, 0);
+ Add(0, 3U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 31U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 32U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 33U, "001", "400", 1000000000U, 0, 0);
+
+ // L1 score more than 6.
+ Add(1, 4U, "050", "300", 1000000000U, 0, 0);
+ file_map_[4u].first->being_compacted = true;
+ Add(1, 5U, "301", "350", 1000000000U, 0, 0);
+ Add(1, 51U, "351", "400", 6000000000U, 0, 0);
+
+ // Output level overlaps with the beginning and the end of the chain
+ Add(2, 6U, "050", "100", 1U);
+ Add(2, 7U, "300", "400", 1U);
+
+  // If the L1 score is larger than L0's, the L1 compaction goes through even
+  // though there is a pending L0 compaction.
+ UpdateVersionStorageInfo();
+ ASSERT_EQ(1, vstorage_->CompactionScoreLevel(0));
+ ASSERT_EQ(0, vstorage_->CompactionScoreLevel(1));
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+}
+
+TEST_F(CompactionPickerTest, EstimateCompactionBytesNeeded1) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 4;
+ mutable_cf_options_.max_bytes_for_level_base = 1000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200", 200);
+ Add(0, 2U, "150", "200", 200);
+ Add(0, 3U, "150", "200", 200);
+ // Level 1 is over target by 200
+ Add(1, 4U, "400", "500", 600);
+ Add(1, 5U, "600", "700", 600);
+  // Level 2 stays below its target of 10000 even after adding the size of
+  // level 1. Size ratio of L2/L1 is 9600 / 1200 = 8.
+ Add(2, 6U, "150", "200", 2500);
+ Add(2, 7U, "201", "210", 2000);
+ Add(2, 8U, "300", "310", 2600);
+ Add(2, 9U, "400", "500", 2500);
+  // Level 3 exceeds its target of 100,000 by 1,000
+ Add(3, 10U, "400", "500", 101000);
+ // Level 4 exceeds target 1,000,000 by 900 after adding size from level 3
+ // Size ratio L4/L3 is 9.9
+ // After merge from L3, L4 size is 1000900
+ Add(4, 11U, "400", "500", 999900);
+ Add(5, 11U, "400", "500", 8007200);
+
+ UpdateVersionStorageInfo();
+
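+  // Expected breakdown, using the sizes above: L1 is 200 over target and is
+  // rewritten at (L2/L1 ratio 8 + 1) = 9x; the L3 overflow of 1000 is
+  // rewritten at (9.9 + 1)x = 10900; that merge leaves L4 about 900 over
+  // target, rewritten at roughly 9x as well.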
+ ASSERT_EQ(200u * 9u + 10900u + 900u * 9,
+ vstorage_->estimated_compaction_needed_bytes());
+}
+
+TEST_F(CompactionPickerTest, EstimateCompactionBytesNeeded2) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 3;
+ mutable_cf_options_.max_bytes_for_level_base = 1000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200", 200);
+ Add(0, 2U, "150", "200", 200);
+ Add(0, 4U, "150", "200", 200);
+ Add(0, 5U, "150", "200", 200);
+ Add(0, 6U, "150", "200", 200);
+ // Level 1 size will be 1400 after merging with L0
+ Add(1, 7U, "400", "500", 200);
+ Add(1, 8U, "600", "700", 200);
+  // Level 2 stays below its target of 10000 even after adding level 1's size
+ Add(2, 9U, "150", "200", 9100);
+  // Level 3 is over the target, but since level 4 is empty, we assume it
+  // will be a trivial move.
+ Add(3, 10U, "400", "500", 101000);
+
+ UpdateVersionStorageInfo();
+
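+  // L0->L1 merge rewrites 1000 (L0) + 400 (L1) = 1400 bytes; the estimated
+  // L1->L2 merge below accounts for the remaining 3000.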
+ // estimated L1->L2 merge: 400 * (9100.0 / 1400.0 + 1.0)
+ ASSERT_EQ(1400u + 3000u, vstorage_->estimated_compaction_needed_bytes());
+}
+
+TEST_F(CompactionPickerTest, EstimateCompactionBytesNeeded3) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 3;
+ mutable_cf_options_.max_bytes_for_level_base = 1000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200", 2000);
+ Add(0, 2U, "150", "200", 2000);
+ Add(0, 4U, "150", "200", 2000);
+ Add(0, 5U, "150", "200", 2000);
+ Add(0, 6U, "150", "200", 1000);
+ // Level 1 size will be 10000 after merging with L0
+ Add(1, 7U, "400", "500", 500);
+ Add(1, 8U, "600", "700", 500);
+
+ Add(2, 9U, "150", "200", 10000);
+
+ UpdateVersionStorageInfo();
+
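+  // L0->L1 merge rewrites 9000 (L0) + 1000 (L1) = 10000 bytes; L1 then sits
+  // 9000 over its target of 1000 and merges into the equally sized L2,
+  // adding another 9000 * (10000 / 10000 + 1) = 18000 bytes.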
+ ASSERT_EQ(10000u + 18000u, vstorage_->estimated_compaction_needed_bytes());
+}
+
+TEST_F(CompactionPickerTest, EstimateCompactionBytesNeededDynamicLevel) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 3;
+ mutable_cf_options_.max_bytes_for_level_base = 1000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+
+  // Set the last level size to 50000.
+  // num_levels - 1 target: 5000
+  // num_levels - 2 is the base level, with target 1000 (rounded up to
+  // max_bytes_for_level_base).
+ Add(num_levels - 1, 10U, "400", "500", 50000);
+
+ Add(0, 1U, "150", "200", 200);
+ Add(0, 2U, "150", "200", 200);
+ Add(0, 4U, "150", "200", 200);
+ Add(0, 5U, "150", "200", 200);
+ Add(0, 6U, "150", "200", 200);
+ // num_levels - 3 is over target by 100 + 1000
+ Add(num_levels - 3, 7U, "400", "500", 550);
+ Add(num_levels - 3, 8U, "600", "700", 550);
+ // num_levels - 2 is over target by 1100 + 200
+ Add(num_levels - 2, 9U, "150", "200", 5200);
+
+ UpdateVersionStorageInfo();
+
+ // Merging to the second last level: (5200 / 2100 + 1) * 1100
+ // Merging to the last level: (50000 / 6300 + 1) * 1300
+ ASSERT_EQ(2100u + 3823u + 11617u,
+ vstorage_->estimated_compaction_needed_bytes());
+}
+
+TEST_F(CompactionPickerTest, IsBottommostLevelTest) {
+ // case 1: Higher levels are empty
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(0, 1U, "a", "m");
+ Add(0, 2U, "c", "z");
+ Add(1, 3U, "d", "e");
+ Add(1, 4U, "l", "p");
+ Add(2, 5U, "g", "i");
+ Add(2, 6U, "x", "z");
+ UpdateVersionStorageInfo();
+ SetCompactionInputFilesLevels(2, 1);
+ AddToCompactionFiles(3U);
+ AddToCompactionFiles(5U);
+ bool result =
+ Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+ ASSERT_TRUE(result);
+
+ // case 2: Higher levels have no overlap
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(0, 1U, "a", "m");
+ Add(0, 2U, "c", "z");
+ Add(1, 3U, "d", "e");
+ Add(1, 4U, "l", "p");
+ Add(2, 5U, "g", "i");
+ Add(2, 6U, "x", "z");
+ Add(3, 7U, "k", "p");
+ Add(3, 8U, "t", "w");
+ Add(4, 9U, "a", "b");
+ Add(5, 10U, "c", "cc");
+ UpdateVersionStorageInfo();
+ SetCompactionInputFilesLevels(2, 1);
+ AddToCompactionFiles(3U);
+ AddToCompactionFiles(5U);
+ result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+ ASSERT_TRUE(result);
+
+ // case 3.1: Higher levels (level 3) have overlap
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(0, 1U, "a", "m");
+ Add(0, 2U, "c", "z");
+ Add(1, 3U, "d", "e");
+ Add(1, 4U, "l", "p");
+ Add(2, 5U, "g", "i");
+ Add(2, 6U, "x", "z");
+ Add(3, 7U, "e", "g");
+ Add(3, 8U, "h", "k");
+ Add(4, 9U, "a", "b");
+ Add(5, 10U, "c", "cc");
+ UpdateVersionStorageInfo();
+ SetCompactionInputFilesLevels(2, 1);
+ AddToCompactionFiles(3U);
+ AddToCompactionFiles(5U);
+ result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+ ASSERT_FALSE(result);
+
+ // case 3.2: Higher levels (level 5) have overlap
+ DeleteVersionStorage();
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(0, 1U, "a", "m");
+ Add(0, 2U, "c", "z");
+ Add(1, 3U, "d", "e");
+ Add(1, 4U, "l", "p");
+ Add(2, 5U, "g", "i");
+ Add(2, 6U, "x", "z");
+ Add(3, 7U, "j", "k");
+ Add(3, 8U, "l", "m");
+ Add(4, 9U, "a", "b");
+ Add(5, 10U, "c", "cc");
+ Add(5, 11U, "h", "k");
+ Add(5, 12U, "y", "yy");
+ Add(5, 13U, "z", "zz");
+ UpdateVersionStorageInfo();
+ SetCompactionInputFilesLevels(2, 1);
+ AddToCompactionFiles(3U);
+ AddToCompactionFiles(5U);
+ result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+ ASSERT_FALSE(result);
+
+ // case 3.3: Higher levels (level 5) have overlap, but it's only overlapping
+ // one key ("d")
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(0, 1U, "a", "m");
+ Add(0, 2U, "c", "z");
+ Add(1, 3U, "d", "e");
+ Add(1, 4U, "l", "p");
+ Add(2, 5U, "g", "i");
+ Add(2, 6U, "x", "z");
+ Add(3, 7U, "j", "k");
+ Add(3, 8U, "l", "m");
+ Add(4, 9U, "a", "b");
+ Add(5, 10U, "c", "cc");
+ Add(5, 11U, "ccc", "d");
+ Add(5, 12U, "y", "yy");
+ Add(5, 13U, "z", "zz");
+ UpdateVersionStorageInfo();
+ SetCompactionInputFilesLevels(2, 1);
+ AddToCompactionFiles(3U);
+ AddToCompactionFiles(5U);
+ result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+ ASSERT_FALSE(result);
+
+ // Level 0 files overlap
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(0, 1U, "s", "t");
+ Add(0, 2U, "a", "m");
+ Add(0, 3U, "b", "z");
+ Add(0, 4U, "e", "f");
+ Add(5, 10U, "y", "z");
+ UpdateVersionStorageInfo();
+ SetCompactionInputFilesLevels(1, 0);
+ AddToCompactionFiles(1U);
+ AddToCompactionFiles(2U);
+ AddToCompactionFiles(3U);
+ AddToCompactionFiles(4U);
+ result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+ ASSERT_FALSE(result);
+
+ // Level 0 files don't overlap
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(0, 1U, "s", "t");
+ Add(0, 2U, "a", "m");
+ Add(0, 3U, "b", "k");
+ Add(0, 4U, "e", "f");
+ Add(5, 10U, "y", "z");
+ UpdateVersionStorageInfo();
+ SetCompactionInputFilesLevels(1, 0);
+ AddToCompactionFiles(1U);
+ AddToCompactionFiles(2U);
+ AddToCompactionFiles(3U);
+ AddToCompactionFiles(4U);
+ result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+ ASSERT_TRUE(result);
+
+ // Level 1 files overlap
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(0, 1U, "s", "t");
+ Add(0, 2U, "a", "m");
+ Add(0, 3U, "b", "k");
+ Add(0, 4U, "e", "f");
+ Add(1, 5U, "a", "m");
+ Add(1, 6U, "n", "o");
+ Add(1, 7U, "w", "y");
+ Add(5, 10U, "y", "z");
+ UpdateVersionStorageInfo();
+ SetCompactionInputFilesLevels(2, 0);
+ AddToCompactionFiles(1U);
+ AddToCompactionFiles(2U);
+ AddToCompactionFiles(3U);
+ AddToCompactionFiles(4U);
+ AddToCompactionFiles(5U);
+ AddToCompactionFiles(6U);
+ AddToCompactionFiles(7U);
+ result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+ ASSERT_FALSE(result);
+
+ DeleteVersionStorage();
+}
+
+TEST_F(CompactionPickerTest, MaxCompactionBytesHit) {
+ mutable_cf_options_.max_bytes_for_level_base = 1000000u;
+ mutable_cf_options_.max_compaction_bytes = 800000u;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ NewVersionStorage(6, kCompactionStyleLevel);
+  // A compaction should be triggered and pick files 2 and 5.
+  // It cannot expand, because adding files 1 and 3 would make the compaction
+  // size exceed mutable_cf_options_.max_compaction_bytes.
+ Add(1, 1U, "100", "150", 300000U);
+ Add(1, 2U, "151", "200", 300001U, 0, 0);
+ Add(1, 3U, "201", "250", 300000U, 0, 0);
+ Add(1, 4U, "251", "300", 300000U, 0, 0);
+ Add(2, 5U, "100", "256", 1U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->num_input_files(1));
+ ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(5U, compaction->input(1, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, MaxCompactionBytesNotHit) {
+ mutable_cf_options_.max_bytes_for_level_base = 800000u;
+ mutable_cf_options_.max_compaction_bytes = 1000000u;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ NewVersionStorage(6, kCompactionStyleLevel);
+  // A compaction should be triggered and pick file 2 and 5;
+  // it expands to files 1 and 3 too.
+ Add(1, 1U, "100", "150", 300000U);
+ Add(1, 2U, "151", "200", 300001U, 0, 0);
+ Add(1, 3U, "201", "250", 300000U, 0, 0);
+ Add(1, 4U, "251", "300", 300000U, 0, 0);
+ Add(2, 5U, "000", "251", 1U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(3U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->num_input_files(1));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(3U, compaction->input(0, 2)->fd.GetNumber());
+ ASSERT_EQ(5U, compaction->input(1, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, IsTrivialMoveOn) {
+ mutable_cf_options_.max_bytes_for_level_base = 10000u;
+ mutable_cf_options_.max_compaction_bytes = 10001u;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ NewVersionStorage(6, kCompactionStyleLevel);
+ // A compaction should be triggered and pick file 2
+ Add(1, 1U, "100", "150", 3000U);
+ Add(1, 2U, "151", "200", 3001U);
+ Add(1, 3U, "201", "250", 3000U);
+ Add(1, 4U, "251", "300", 3000U);
+
+ Add(3, 5U, "120", "130", 7000U);
+ Add(3, 6U, "170", "180", 7000U);
+ Add(3, 5U, "220", "230", 7000U);
+ Add(3, 5U, "270", "280", 7000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_TRUE(compaction->IsTrivialMove());
+}
+
+TEST_F(CompactionPickerTest, IsTrivialMoveOff) {
+ mutable_cf_options_.max_bytes_for_level_base = 1000000u;
+ mutable_cf_options_.max_compaction_bytes = 10000u;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ NewVersionStorage(6, kCompactionStyleLevel);
+ // A compaction should be triggered and pick all files from level 1
+ Add(1, 1U, "100", "150", 300000U, 0, 0);
+ Add(1, 2U, "150", "200", 300000U, 0, 0);
+ Add(1, 3U, "200", "250", 300000U, 0, 0);
+ Add(1, 4U, "250", "300", 300000U, 0, 0);
+
+ Add(3, 5U, "120", "130", 6000U);
+ Add(3, 6U, "140", "150", 6000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_FALSE(compaction->IsTrivialMove());
+}
+
+TEST_F(CompactionPickerTest, CacheNextCompactionIndex) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.max_compaction_bytes = 100000000000u;
+
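+  // File 1 (L1) and file 5 (L2) are marked as being compacted below, so the
+  // first pick skips files 1 and 2 (file 2 overlaps the in-progress L2
+  // compaction) and chooses file 3, caching NextCompactionIndex so the next
+  // pick resumes at file 4 instead of rescanning the level from the start.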
+ Add(1 /* level */, 1U /* file_number */, "100" /* smallest */,
+ "149" /* largest */, 1000000000U /* file_size */);
+ file_map_[1U].first->being_compacted = true;
+ Add(1 /* level */, 2U /* file_number */, "150" /* smallest */,
+ "199" /* largest */, 900000000U /* file_size */);
+ Add(1 /* level */, 3U /* file_number */, "200" /* smallest */,
+ "249" /* largest */, 800000000U /* file_size */);
+ Add(1 /* level */, 4U /* file_number */, "250" /* smallest */,
+ "299" /* largest */, 700000000U /* file_size */);
+ Add(2 /* level */, 5U /* file_number */, "150" /* smallest */,
+ "199" /* largest */, 1U /* file_size */);
+ file_map_[5U].first->being_compacted = true;
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_levels());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(0U, compaction->num_input_files(1));
+ ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2, vstorage_->NextCompactionIndex(1 /* level */));
+
+ compaction.reset(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_levels());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(0U, compaction->num_input_files(1));
+ ASSERT_EQ(4U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(3, vstorage_->NextCompactionIndex(1 /* level */));
+
+ compaction.reset(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() == nullptr);
+ ASSERT_EQ(4, vstorage_->NextCompactionIndex(1 /* level */));
+}
+
+TEST_F(CompactionPickerTest, IntraL0MaxCompactionBytesNotHit) {
+ // Intra L0 compaction triggers only if there are at least
+ // level0_file_num_compaction_trigger + 2 L0 files.
+ mutable_cf_options_.level0_file_num_compaction_trigger = 3;
+ mutable_cf_options_.max_compaction_bytes = 1000000u;
+ NewVersionStorage(6, kCompactionStyleLevel);
+
+ // All 5 L0 files will be picked for intra L0 compaction. The one L1 file
+ // spans entire L0 key range and is marked as being compacted to avoid
+ // L0->L1 compaction.
+ Add(0, 1U, "100", "150", 200000U, 0, 100, 101);
+ Add(0, 2U, "151", "200", 200000U, 0, 102, 103);
+ Add(0, 3U, "201", "250", 200000U, 0, 104, 105);
+ Add(0, 4U, "251", "300", 200000U, 0, 106, 107);
+ Add(0, 5U, "301", "350", 200000U, 0, 108, 109);
+ Add(1, 6U, "100", "350", 200000U, 0, 110, 111);
+ vstorage_->LevelFiles(1)[0]->being_compacted = true;
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_levels());
+ ASSERT_EQ(5U, compaction->num_input_files(0));
+ ASSERT_EQ(CompactionReason::kLevelL0FilesNum,
+ compaction->compaction_reason());
+ ASSERT_EQ(0, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, IntraL0MaxCompactionBytesHit) {
+ // Intra L0 compaction triggers only if there are at least
+ // level0_file_num_compaction_trigger + 2 L0 files.
+ mutable_cf_options_.level0_file_num_compaction_trigger = 3;
+ mutable_cf_options_.max_compaction_bytes = 999999u;
+ NewVersionStorage(6, kCompactionStyleLevel);
+
+ // 4 out of 5 L0 files will be picked for intra L0 compaction due to
+ // max_compaction_bytes limit (the minimum number of files for triggering
+ // intra L0 compaction is 4). The one L1 file spans entire L0 key range and
+ // is marked as being compacted to avoid L0->L1 compaction.
+ Add(0, 1U, "100", "150", 200000U, 0, 100, 101);
+ Add(0, 2U, "151", "200", 200000U, 0, 102, 103);
+ Add(0, 3U, "201", "250", 200000U, 0, 104, 105);
+ Add(0, 4U, "251", "300", 200000U, 0, 106, 107);
+ Add(0, 5U, "301", "350", 200000U, 0, 108, 109);
+ Add(1, 6U, "100", "350", 200000U, 0, 109, 110);
+ vstorage_->LevelFiles(1)[0]->being_compacted = true;
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_levels());
+ ASSERT_EQ(4U, compaction->num_input_files(0));
+ ASSERT_EQ(CompactionReason::kLevelL0FilesNum,
+ compaction->compaction_reason());
+ ASSERT_EQ(0, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, IntraL0ForEarliestSeqno) {
+ // Intra L0 compaction triggers only if there are at least
+ // level0_file_num_compaction_trigger + 2 L0 files.
+ mutable_cf_options_.level0_file_num_compaction_trigger = 3;
+ mutable_cf_options_.max_compaction_bytes = 999999u;
+ NewVersionStorage(6, kCompactionStyleLevel);
+
+  // 4 out of 6 L0 files will be picked for intra L0 compaction: the newest
+  // L0 file is skipped due to the earliest seqno, and picking stops at the
+  // oldest L0 file, which is already being compacted. The one L1 file spans
+  // the entire L0 key range and is marked as being compacted to avoid L0->L1
+  // compaction.
+ Add(1, 1U, "100", "350", 200000U, 0, 110, 111);
+ Add(0, 2U, "301", "350", 1U, 0, 108, 109);
+ Add(0, 3U, "251", "300", 1U, 0, 106, 107);
+ Add(0, 4U, "201", "250", 1U, 0, 104, 105);
+ Add(0, 5U, "151", "200", 1U, 0, 102, 103);
+ Add(0, 6U, "100", "150", 1U, 0, 100, 101);
+ Add(0, 7U, "100", "100", 1U, 0, 99, 100);
+ vstorage_->LevelFiles(0)[5]->being_compacted = true;
+ vstorage_->LevelFiles(1)[0]->being_compacted = true;
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_, 107));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_levels());
+ ASSERT_EQ(4U, compaction->num_input_files(0));
+ ASSERT_EQ(CompactionReason::kLevelL0FilesNum,
+ compaction->compaction_reason());
+ ASSERT_EQ(0, compaction->output_level());
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/compaction/compaction_picker_universal.cc b/src/rocksdb/db/compaction/compaction_picker_universal.cc
new file mode 100644
index 000000000..d8b63956e
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_universal.cc
@@ -0,0 +1,1105 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction_picker_universal.h"
+#ifndef ROCKSDB_LITE
+
+#include <cinttypes>
+#include <limits>
+#include <queue>
+#include <string>
+#include <utility>
+#include "db/column_family.h"
+#include "file/filename.h"
+#include "logging/log_buffer.h"
+#include "monitoring/statistics.h"
+#include "test_util/sync_point.h"
+#include "util/random.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+// A helper class that forms universal compactions. It is used by
+// UniversalCompactionPicker::PickCompaction(): construct an instance, then
+// obtain the compaction object by calling PickCompaction().
+class UniversalCompactionBuilder {
+ public:
+ UniversalCompactionBuilder(const ImmutableCFOptions& ioptions,
+ const InternalKeyComparator* icmp,
+ const std::string& cf_name,
+ const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* vstorage,
+ UniversalCompactionPicker* picker,
+ LogBuffer* log_buffer)
+ : ioptions_(ioptions),
+ icmp_(icmp),
+ cf_name_(cf_name),
+ mutable_cf_options_(mutable_cf_options),
+ vstorage_(vstorage),
+ picker_(picker),
+ log_buffer_(log_buffer) {}
+
+  // Form and return the compaction object. The caller owns the returned
+  // object.
+ Compaction* PickCompaction();
+
+ private:
+ struct SortedRun {
+ SortedRun(int _level, FileMetaData* _file, uint64_t _size,
+ uint64_t _compensated_file_size, bool _being_compacted)
+ : level(_level),
+ file(_file),
+ size(_size),
+ compensated_file_size(_compensated_file_size),
+ being_compacted(_being_compacted) {
+ assert(compensated_file_size > 0);
+ assert(level != 0 || file != nullptr);
+ }
+
+ void Dump(char* out_buf, size_t out_buf_size,
+ bool print_path = false) const;
+
+ // sorted_run_count is added into the string to print
+ void DumpSizeInfo(char* out_buf, size_t out_buf_size,
+ size_t sorted_run_count) const;
+
+ int level;
+    // `file` will be null for level > 0. For level = 0, the sorted run
+    // consists of just this file.
+ FileMetaData* file;
+    // For level > 0, `size` and `compensated_file_size` are the sums of the
+    // sizes of all files in the level. `being_compacted` should be the same
+    // for all files in a non-zero level; use the value stored here.
+ uint64_t size;
+ uint64_t compensated_file_size;
+ bool being_compacted;
+ };
+
+ // Pick Universal compaction to limit read amplification
+ Compaction* PickCompactionToReduceSortedRuns(
+ unsigned int ratio, unsigned int max_number_of_files_to_compact);
+
+ // Pick Universal compaction to limit space amplification.
+ Compaction* PickCompactionToReduceSizeAmp();
+
+ Compaction* PickDeleteTriggeredCompaction();
+
+ // Form a compaction from the sorted run indicated by start_index to the
+ // oldest sorted run.
+ // The caller is responsible for making sure that those files are not in
+ // compaction.
+ Compaction* PickCompactionToOldest(size_t start_index,
+ CompactionReason compaction_reason);
+
+  // Try to pick a periodic compaction. The caller should only call it
+  // if there is at least one file marked for periodic compaction.
+  // nullptr will be returned if no such compaction can be formed
+  // because some files are being compacted.
+ Compaction* PickPeriodicCompaction();
+
+  // Used in universal compaction when the allow_trivial_move
+  // option is set. Checks whether there are any overlapping files
+  // in the input. Returns true if the input files are
+  // non-overlapping.
+ bool IsInputFilesNonOverlapping(Compaction* c);
+
+ const ImmutableCFOptions& ioptions_;
+ const InternalKeyComparator* icmp_;
+ double score_;
+ std::vector<SortedRun> sorted_runs_;
+ const std::string& cf_name_;
+ const MutableCFOptions& mutable_cf_options_;
+ VersionStorageInfo* vstorage_;
+ UniversalCompactionPicker* picker_;
+ LogBuffer* log_buffer_;
+
+ static std::vector<SortedRun> CalculateSortedRuns(
+ const VersionStorageInfo& vstorage, const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options);
+
+ // Pick a path ID to place a newly generated file, with its estimated file
+ // size.
+ static uint32_t GetPathId(const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options,
+ uint64_t file_size);
+};
+
+// Used in universal compaction when trivial move is enabled.
+// This structure is used for the construction of a min heap
+// that contains the file metadata, the level of the file,
+// and the index of the file in that level.
+
+struct InputFileInfo {
+ InputFileInfo() : f(nullptr), level(0), index(0) {}
+
+ FileMetaData* f;
+ size_t level;
+ size_t index;
+};
+
+// Used in universal compaction when trivial move is enabled.
+// This comparator is used for the construction of min heap
+// based on the smallest key of the file.
+struct SmallestKeyHeapComparator {
+ explicit SmallestKeyHeapComparator(const Comparator* ucmp) { ucmp_ = ucmp; }
+
+ bool operator()(InputFileInfo i1, InputFileInfo i2) const {
+ return (ucmp_->Compare(i1.f->smallest.user_key(),
+ i2.f->smallest.user_key()) > 0);
+ }
+
+ private:
+ const Comparator* ucmp_;
+};
+
+typedef std::priority_queue<InputFileInfo, std::vector<InputFileInfo>,
+ SmallestKeyHeapComparator>
+ SmallestKeyHeap;
+
+// This function creates the heap that is used to find if the files are
+// overlapping during universal compaction when the allow_trivial_move
+// is set.
+SmallestKeyHeap create_level_heap(Compaction* c, const Comparator* ucmp) {
+ SmallestKeyHeap smallest_key_priority_q =
+ SmallestKeyHeap(SmallestKeyHeapComparator(ucmp));
+
+ InputFileInfo input_file;
+
+ for (size_t l = 0; l < c->num_input_levels(); l++) {
+ if (c->num_input_files(l) != 0) {
+ if (l == 0 && c->start_level() == 0) {
+ for (size_t i = 0; i < c->num_input_files(0); i++) {
+ input_file.f = c->input(0, i);
+ input_file.level = 0;
+ input_file.index = i;
+ smallest_key_priority_q.push(std::move(input_file));
+ }
+ } else {
+ input_file.f = c->input(l, 0);
+ input_file.level = l;
+ input_file.index = 0;
+ smallest_key_priority_q.push(std::move(input_file));
+ }
+ }
+ }
+ return smallest_key_priority_q;
+}
+
+#ifndef NDEBUG
+// smallest_seqno and largest_seqno are set iff `files` is not empty.
+void GetSmallestLargestSeqno(const std::vector<FileMetaData*>& files,
+ SequenceNumber* smallest_seqno,
+ SequenceNumber* largest_seqno) {
+ bool is_first = true;
+ for (FileMetaData* f : files) {
+ assert(f->fd.smallest_seqno <= f->fd.largest_seqno);
+ if (is_first) {
+ is_first = false;
+ *smallest_seqno = f->fd.smallest_seqno;
+ *largest_seqno = f->fd.largest_seqno;
+ } else {
+ if (f->fd.smallest_seqno < *smallest_seqno) {
+ *smallest_seqno = f->fd.smallest_seqno;
+ }
+ if (f->fd.largest_seqno > *largest_seqno) {
+ *largest_seqno = f->fd.largest_seqno;
+ }
+ }
+ }
+}
+#endif
+} // namespace
+
+// Checks whether there are any overlapping files in the input.
+bool UniversalCompactionBuilder::IsInputFilesNonOverlapping(Compaction* c) {
+ auto comparator = icmp_->user_comparator();
+ int first_iter = 1;
+
+ InputFileInfo prev, curr, next;
+
+ SmallestKeyHeap smallest_key_priority_q =
+ create_level_heap(c, icmp_->user_comparator());
+
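+  // Pop runs in ascending order of smallest user key. The inputs are
+  // non-overlapping iff each popped file's smallest key is strictly greater
+  // than the previously popped file's largest key.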
+ while (!smallest_key_priority_q.empty()) {
+ curr = smallest_key_priority_q.top();
+ smallest_key_priority_q.pop();
+
+ if (first_iter) {
+ prev = curr;
+ first_iter = 0;
+ } else {
+ if (comparator->Compare(prev.f->largest.user_key(),
+ curr.f->smallest.user_key()) >= 0) {
+ // found overlapping files, return false
+ return false;
+ }
+ assert(comparator->Compare(curr.f->largest.user_key(),
+ prev.f->largest.user_key()) > 0);
+ prev = curr;
+ }
+
+ next.f = nullptr;
+
+ if (c->level(curr.level) != 0 &&
+ curr.index < c->num_input_files(curr.level) - 1) {
+ next.f = c->input(curr.level, curr.index + 1);
+ next.level = curr.level;
+ next.index = curr.index + 1;
+ }
+
+ if (next.f) {
+ smallest_key_priority_q.push(std::move(next));
+ }
+ }
+ return true;
+}
+
+bool UniversalCompactionPicker::NeedsCompaction(
+ const VersionStorageInfo* vstorage) const {
+ const int kLevel0 = 0;
+ if (vstorage->CompactionScore(kLevel0) >= 1) {
+ return true;
+ }
+ if (!vstorage->FilesMarkedForPeriodicCompaction().empty()) {
+ return true;
+ }
+ if (!vstorage->FilesMarkedForCompaction().empty()) {
+ return true;
+ }
+ return false;
+}
+
+Compaction* UniversalCompactionPicker::PickCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* vstorage, LogBuffer* log_buffer,
+ SequenceNumber /* earliest_memtable_seqno */) {
+ UniversalCompactionBuilder builder(ioptions_, icmp_, cf_name,
+ mutable_cf_options, vstorage, this,
+ log_buffer);
+ return builder.PickCompaction();
+}
+
+void UniversalCompactionBuilder::SortedRun::Dump(char* out_buf,
+ size_t out_buf_size,
+ bool print_path) const {
+ if (level == 0) {
+ assert(file != nullptr);
+ if (file->fd.GetPathId() == 0 || !print_path) {
+ snprintf(out_buf, out_buf_size, "file %" PRIu64, file->fd.GetNumber());
+ } else {
+ snprintf(out_buf, out_buf_size, "file %" PRIu64
+ "(path "
+ "%" PRIu32 ")",
+ file->fd.GetNumber(), file->fd.GetPathId());
+ }
+ } else {
+ snprintf(out_buf, out_buf_size, "level %d", level);
+ }
+}
+
+void UniversalCompactionBuilder::SortedRun::DumpSizeInfo(
+ char* out_buf, size_t out_buf_size, size_t sorted_run_count) const {
+ if (level == 0) {
+ assert(file != nullptr);
+ snprintf(out_buf, out_buf_size,
+ "file %" PRIu64 "[%" ROCKSDB_PRIszt
+ "] "
+ "with size %" PRIu64 " (compensated size %" PRIu64 ")",
+ file->fd.GetNumber(), sorted_run_count, file->fd.GetFileSize(),
+ file->compensated_file_size);
+ } else {
+ snprintf(out_buf, out_buf_size,
+ "level %d[%" ROCKSDB_PRIszt
+ "] "
+ "with size %" PRIu64 " (compensated size %" PRIu64 ")",
+ level, sorted_run_count, size, compensated_file_size);
+ }
+}
+
+std::vector<UniversalCompactionBuilder::SortedRun>
+UniversalCompactionBuilder::CalculateSortedRuns(
+ const VersionStorageInfo& vstorage, const ImmutableCFOptions& /*ioptions*/,
+ const MutableCFOptions& mutable_cf_options) {
+ std::vector<UniversalCompactionBuilder::SortedRun> ret;
+ for (FileMetaData* f : vstorage.LevelFiles(0)) {
+ ret.emplace_back(0, f, f->fd.GetFileSize(), f->compensated_file_size,
+ f->being_compacted);
+ }
+ for (int level = 1; level < vstorage.num_levels(); level++) {
+ uint64_t total_compensated_size = 0U;
+ uint64_t total_size = 0U;
+ bool being_compacted = false;
+ bool is_first = true;
+ for (FileMetaData* f : vstorage.LevelFiles(level)) {
+ total_compensated_size += f->compensated_file_size;
+ total_size += f->fd.GetFileSize();
+ if (mutable_cf_options.compaction_options_universal.allow_trivial_move ==
+ true) {
+ if (f->being_compacted) {
+ being_compacted = f->being_compacted;
+ }
+ } else {
+ // Compaction always includes all files for a non-zero level, so for a
+ // non-zero level, all the files should share the same being_compacted
+ // value.
+ // This assumption is only valid when
+ // mutable_cf_options.compaction_options_universal.allow_trivial_move
+ // is false
+ assert(is_first || f->being_compacted == being_compacted);
+ }
+ if (is_first) {
+ being_compacted = f->being_compacted;
+ is_first = false;
+ }
+ }
+ if (total_compensated_size > 0) {
+ ret.emplace_back(level, nullptr, total_size, total_compensated_size,
+ being_compacted);
+ }
+ }
+ return ret;
+}
+
+// Universal style of compaction. Pick files that are contiguous in
+// time-range to compact.
+Compaction* UniversalCompactionBuilder::PickCompaction() {
+ const int kLevel0 = 0;
+ score_ = vstorage_->CompactionScore(kLevel0);
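+  // CalculateSortedRuns() returns runs ordered from newest to oldest: each L0
+  // file is its own sorted run, followed by one run per non-empty level >= 1.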
+ sorted_runs_ =
+ CalculateSortedRuns(*vstorage_, ioptions_, mutable_cf_options_);
+
+ if (sorted_runs_.size() == 0 ||
+ (vstorage_->FilesMarkedForPeriodicCompaction().empty() &&
+ vstorage_->FilesMarkedForCompaction().empty() &&
+ sorted_runs_.size() < (unsigned int)mutable_cf_options_
+ .level0_file_num_compaction_trigger)) {
+ ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: nothing to do\n",
+ cf_name_.c_str());
+ TEST_SYNC_POINT_CALLBACK(
+ "UniversalCompactionBuilder::PickCompaction:Return", nullptr);
+ return nullptr;
+ }
+ VersionStorageInfo::LevelSummaryStorage tmp;
+ ROCKS_LOG_BUFFER_MAX_SZ(
+ log_buffer_, 3072,
+ "[%s] Universal: sorted runs files(%" ROCKSDB_PRIszt "): %s\n",
+ cf_name_.c_str(), sorted_runs_.size(), vstorage_->LevelSummary(&tmp));
+
+ Compaction* c = nullptr;
+  // Periodic compaction has higher priority than other types of compaction
+  // because it's a hard requirement.
+ if (!vstorage_->FilesMarkedForPeriodicCompaction().empty()) {
+ // Always need to do a full compaction for periodic compaction.
+ c = PickPeriodicCompaction();
+ }
+
+ // Check for size amplification.
+ if (c == nullptr &&
+ sorted_runs_.size() >=
+ static_cast<size_t>(
+ mutable_cf_options_.level0_file_num_compaction_trigger)) {
+ if ((c = PickCompactionToReduceSizeAmp()) != nullptr) {
+ ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: compacting for size amp\n",
+ cf_name_.c_str());
+ } else {
+ // Size amplification is within limits. Try reducing read
+ // amplification while maintaining file size ratios.
+ unsigned int ratio =
+ mutable_cf_options_.compaction_options_universal.size_ratio;
+
+ if ((c = PickCompactionToReduceSortedRuns(ratio, UINT_MAX)) != nullptr) {
+ ROCKS_LOG_BUFFER(log_buffer_,
+ "[%s] Universal: compacting for size ratio\n",
+ cf_name_.c_str());
+ } else {
+ // Size amplification and file size ratios are within configured limits.
+        // If max read amplification exceeds the configured limit, then force
+        // a compaction without looking at file size ratios and try to reduce
+        // the number of files to fewer than level0_file_num_compaction_trigger.
+        // This is guaranteed by NeedsCompaction().
+ assert(sorted_runs_.size() >=
+ static_cast<size_t>(
+ mutable_cf_options_.level0_file_num_compaction_trigger));
+ // Get the total number of sorted runs that are not being compacted
+ int num_sr_not_compacted = 0;
+ for (size_t i = 0; i < sorted_runs_.size(); i++) {
+ if (sorted_runs_[i].being_compacted == false) {
+ num_sr_not_compacted++;
+ }
+ }
+
+ // The number of sorted runs that are not being compacted is greater
+ // than the maximum allowed number of sorted runs
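+          // Merging N sorted runs into one reduces the run count by N - 1,
+          // so compacting (num_sr_not_compacted - trigger + 1) runs brings
+          // the count back down to the trigger.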
+ if (num_sr_not_compacted >
+ mutable_cf_options_.level0_file_num_compaction_trigger) {
+ unsigned int num_files =
+ num_sr_not_compacted -
+ mutable_cf_options_.level0_file_num_compaction_trigger + 1;
+ if ((c = PickCompactionToReduceSortedRuns(UINT_MAX, num_files)) !=
+ nullptr) {
+ ROCKS_LOG_BUFFER(log_buffer_,
+ "[%s] Universal: compacting for file num -- %u\n",
+ cf_name_.c_str(), num_files);
+ }
+ }
+ }
+ }
+ }
+
+ if (c == nullptr) {
+ if ((c = PickDeleteTriggeredCompaction()) != nullptr) {
+ ROCKS_LOG_BUFFER(log_buffer_,
+ "[%s] Universal: delete triggered compaction\n",
+ cf_name_.c_str());
+ }
+ }
+
+ if (c == nullptr) {
+ TEST_SYNC_POINT_CALLBACK(
+ "UniversalCompactionBuilder::PickCompaction:Return", nullptr);
+ return nullptr;
+ }
+
+ if (mutable_cf_options_.compaction_options_universal.allow_trivial_move ==
+ true &&
+ c->compaction_reason() != CompactionReason::kPeriodicCompaction) {
+ c->set_is_trivial_move(IsInputFilesNonOverlapping(c));
+ }
+
+// validate that all the chosen files of L0 are non-overlapping in time
+#ifndef NDEBUG
+ SequenceNumber prev_smallest_seqno = 0U;
+ bool is_first = true;
+
+ size_t level_index = 0U;
+ if (c->start_level() == 0) {
+ for (auto f : *c->inputs(0)) {
+ assert(f->fd.smallest_seqno <= f->fd.largest_seqno);
+ if (is_first) {
+ is_first = false;
+ }
+ prev_smallest_seqno = f->fd.smallest_seqno;
+ }
+ level_index = 1U;
+ }
+ for (; level_index < c->num_input_levels(); level_index++) {
+ if (c->num_input_files(level_index) != 0) {
+ SequenceNumber smallest_seqno = 0U;
+ SequenceNumber largest_seqno = 0U;
+ GetSmallestLargestSeqno(*(c->inputs(level_index)), &smallest_seqno,
+ &largest_seqno);
+ if (is_first) {
+ is_first = false;
+ } else if (prev_smallest_seqno > 0) {
+ // A level is considered as the bottommost level if there are
+ // no files in higher levels or if files in higher levels do
+ // not overlap with the files being compacted. Sequence numbers
+ // of files in bottommost level can be set to 0 to help
+ // compression. As a result, the following assert may not hold
+ // if the prev_smallest_seqno is 0.
+ assert(prev_smallest_seqno > largest_seqno);
+ }
+ prev_smallest_seqno = smallest_seqno;
+ }
+ }
+#endif
+ // update statistics
+ RecordInHistogram(ioptions_.statistics, NUM_FILES_IN_SINGLE_COMPACTION,
+ c->inputs(0)->size());
+
+ picker_->RegisterCompaction(c);
+ vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_);
+
+ TEST_SYNC_POINT_CALLBACK("UniversalCompactionBuilder::PickCompaction:Return",
+ c);
+ return c;
+}
+
+uint32_t UniversalCompactionBuilder::GetPathId(
+ const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options, uint64_t file_size) {
+ // Two conditions need to be satisfied:
+ // (1) the target path needs to be able to hold the file's size
+  // (2) the total size left in this and previous paths needs to be no
+  //     smaller than the expected future file size before this new file is
+  //     compacted, which is estimated based on size_ratio.
+ // For example, if now we are compacting files of size (1, 1, 2, 4, 8),
+ // we will make sure the target file, probably with size of 16, will be
+ // placed in a path so that eventually when new files are generated and
+ // compacted to (1, 1, 2, 4, 8, 16), all those files can be stored in or
+ // before the path we chose.
+ //
+ // TODO(sdong): now the case of multiple column families is not
+ // considered in this algorithm. So the target size can be violated in
+ // that case. We need to improve it.
+ uint64_t accumulated_size = 0;
+ uint64_t future_size =
+ file_size *
+ (100 - mutable_cf_options.compaction_options_universal.size_ratio) / 100;
+ uint32_t p = 0;
+ assert(!ioptions.cf_paths.empty());
+ for (; p < ioptions.cf_paths.size() - 1; p++) {
+ uint64_t target_size = ioptions.cf_paths[p].target_size;
+ if (target_size > file_size &&
+ accumulated_size + (target_size - file_size) > future_size) {
+ return p;
+ }
+ accumulated_size += target_size;
+ }
+ return p;
+}
+
+//
+// Consider compaction files based on their size differences with
+// the next file in time order.
+//
+Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns(
+ unsigned int ratio, unsigned int max_number_of_files_to_compact) {
+ unsigned int min_merge_width =
+ mutable_cf_options_.compaction_options_universal.min_merge_width;
+ unsigned int max_merge_width =
+ mutable_cf_options_.compaction_options_universal.max_merge_width;
+
+ const SortedRun* sr = nullptr;
+ bool done = false;
+ size_t start_index = 0;
+ unsigned int candidate_count = 0;
+
+ unsigned int max_files_to_compact =
+ std::min(max_merge_width, max_number_of_files_to_compact);
+ min_merge_width = std::max(min_merge_width, 2U);
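+  // A compaction must merge at least two sorted runs, so clamp the configured
+  // minimum merge width to 2.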
+
+  // The caller checks the size before executing this function. This invariant
+  // is important because otherwise we may have an integer underflow when
+  // dealing with unsigned types.
+ assert(sorted_runs_.size() > 0);
+
+ // Considers a candidate file only if it is smaller than the
+ // total size accumulated so far.
+ for (size_t loop = 0; loop < sorted_runs_.size(); loop++) {
+ candidate_count = 0;
+
+ // Skip files that are already being compacted
+ for (sr = nullptr; loop < sorted_runs_.size(); loop++) {
+ sr = &sorted_runs_[loop];
+
+ if (!sr->being_compacted) {
+ candidate_count = 1;
+ break;
+ }
+ char file_num_buf[kFormatFileNumberBufSize];
+ sr->Dump(file_num_buf, sizeof(file_num_buf));
+ ROCKS_LOG_BUFFER(log_buffer_,
+ "[%s] Universal: %s"
+ "[%d] being compacted, skipping",
+ cf_name_.c_str(), file_num_buf, loop);
+
+ sr = nullptr;
+ }
+
+ // This file is not being compacted. Consider it as the
+ // first candidate to be compacted.
+ uint64_t candidate_size = sr != nullptr ? sr->compensated_file_size : 0;
+ if (sr != nullptr) {
+ char file_num_buf[kFormatFileNumberBufSize];
+ sr->Dump(file_num_buf, sizeof(file_num_buf), true);
+ ROCKS_LOG_BUFFER(log_buffer_,
+ "[%s] Universal: Possible candidate %s[%d].",
+ cf_name_.c_str(), file_num_buf, loop);
+ }
+
+ // Check if the succeeding files need compaction.
+ for (size_t i = loop + 1;
+ candidate_count < max_files_to_compact && i < sorted_runs_.size();
+ i++) {
+ const SortedRun* succeeding_sr = &sorted_runs_[i];
+ if (succeeding_sr->being_compacted) {
+ break;
+ }
+ // Pick files if the total/last candidate file size (increased by the
+ // specified ratio) is still larger than the next candidate file.
+ // candidate_size is the total size of files picked so far with the
+ // default kCompactionStopStyleTotalSize; with
+ // kCompactionStopStyleSimilarSize, it's simply the size of the last
+ // picked file.
+ double sz = candidate_size * (100.0 + ratio) / 100.0;
+ if (sz < static_cast<double>(succeeding_sr->size)) {
+ break;
+ }
+ if (mutable_cf_options_.compaction_options_universal.stop_style ==
+ kCompactionStopStyleSimilarSize) {
+ // Similar-size stopping rule: also check the last picked file isn't
+ // far larger than the next candidate file.
+ sz = (succeeding_sr->size * (100.0 + ratio)) / 100.0;
+ if (sz < static_cast<double>(candidate_size)) {
+ // If the small file we've encountered begins a run of similar-size
+ // files, we'll pick them up on a future iteration of the outer
+ // loop. If it's some lonely straggler, it'll eventually get picked
+ // by the last-resort read amp strategy which disregards size ratios.
+ break;
+ }
+ candidate_size = succeeding_sr->compensated_file_size;
+ } else { // default kCompactionStopStyleTotalSize
+ candidate_size += succeeding_sr->compensated_file_size;
+ }
+ candidate_count++;
+ }
+
+ // Found a series of consecutive files that need compaction.
+ if (candidate_count >= (unsigned int)min_merge_width) {
+ start_index = loop;
+ done = true;
+ break;
+ } else {
+ for (size_t i = loop;
+ i < loop + candidate_count && i < sorted_runs_.size(); i++) {
+ const SortedRun* skipping_sr = &sorted_runs_[i];
+ char file_num_buf[256];
+ skipping_sr->DumpSizeInfo(file_num_buf, sizeof(file_num_buf), loop);
+ ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: Skipping %s",
+ cf_name_.c_str(), file_num_buf);
+ }
+ }
+ }
+ if (!done || candidate_count <= 1) {
+ return nullptr;
+ }
+ size_t first_index_after = start_index + candidate_count;
+  // Compression is enabled only if the sorted runs older than the ones being
+  // compacted hold less than compression_size_percent of the total data.
+ bool enable_compression = true;
+ int ratio_to_compress =
+ mutable_cf_options_.compaction_options_universal.compression_size_percent;
+ if (ratio_to_compress >= 0) {
+ uint64_t total_size = 0;
+ for (auto& sorted_run : sorted_runs_) {
+ total_size += sorted_run.compensated_file_size;
+ }
+
+ uint64_t older_file_size = 0;
+ for (size_t i = sorted_runs_.size() - 1; i >= first_index_after; i--) {
+ older_file_size += sorted_runs_[i].size;
+ if (older_file_size * 100L >= total_size * (long)ratio_to_compress) {
+ enable_compression = false;
+ break;
+ }
+ }
+ }
+
+ uint64_t estimated_total_size = 0;
+ for (unsigned int i = 0; i < first_index_after; i++) {
+ estimated_total_size += sorted_runs_[i].size;
+ }
+ uint32_t path_id =
+ GetPathId(ioptions_, mutable_cf_options_, estimated_total_size);
+ int start_level = sorted_runs_[start_index].level;
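+  // Choose the output level: the bottommost level if every remaining sorted
+  // run is picked; L0 if the next (older) unpicked run is still an L0 file;
+  // otherwise one level above the next unpicked run.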
+ int output_level;
+ if (first_index_after == sorted_runs_.size()) {
+ output_level = vstorage_->num_levels() - 1;
+ } else if (sorted_runs_[first_index_after].level == 0) {
+ output_level = 0;
+ } else {
+ output_level = sorted_runs_[first_index_after].level - 1;
+ }
+
+ // last level is reserved for the files ingested behind
+ if (ioptions_.allow_ingest_behind &&
+ (output_level == vstorage_->num_levels() - 1)) {
+ assert(output_level > 1);
+ output_level--;
+ }
+
+ std::vector<CompactionInputFiles> inputs(vstorage_->num_levels());
+ for (size_t i = 0; i < inputs.size(); ++i) {
+ inputs[i].level = start_level + static_cast<int>(i);
+ }
+ for (size_t i = start_index; i < first_index_after; i++) {
+ auto& picking_sr = sorted_runs_[i];
+ if (picking_sr.level == 0) {
+ FileMetaData* picking_file = picking_sr.file;
+ inputs[0].files.push_back(picking_file);
+ } else {
+ auto& files = inputs[picking_sr.level - start_level].files;
+ for (auto* f : vstorage_->LevelFiles(picking_sr.level)) {
+ files.push_back(f);
+ }
+ }
+ char file_num_buf[256];
+ picking_sr.DumpSizeInfo(file_num_buf, sizeof(file_num_buf), i);
+ ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: Picking %s",
+ cf_name_.c_str(), file_num_buf);
+ }
+
+ CompactionReason compaction_reason;
+ if (max_number_of_files_to_compact == UINT_MAX) {
+ compaction_reason = CompactionReason::kUniversalSizeRatio;
+ } else {
+ compaction_reason = CompactionReason::kUniversalSortedRunNum;
+ }
+ return new Compaction(
+ vstorage_, ioptions_, mutable_cf_options_, std::move(inputs),
+ output_level,
+ MaxFileSizeForLevel(mutable_cf_options_, output_level,
+ kCompactionStyleUniversal),
+ LLONG_MAX, path_id,
+ GetCompressionType(ioptions_, vstorage_, mutable_cf_options_, start_level,
+ 1, enable_compression),
+ GetCompressionOptions(ioptions_, vstorage_, start_level,
+ enable_compression),
+ /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ false,
+ score_, false /* deletion_compaction */, compaction_reason);
+}
+
+// Look at overall size amplification. If size amplification
+// exceeds the configured value, then do a compaction
+// of the candidate files all the way up to the earliest
+// base file (overrides configured values of file-size ratios,
+// min_merge_width and max_merge_width).
+//
+Compaction* UniversalCompactionBuilder::PickCompactionToReduceSizeAmp() {
+ // percentage flexibility while reducing size amplification
+ uint64_t ratio = mutable_cf_options_.compaction_options_universal
+ .max_size_amplification_percent;
+
+ unsigned int candidate_count = 0;
+ uint64_t candidate_size = 0;
+ size_t start_index = 0;
+ const SortedRun* sr = nullptr;
+
+ assert(!sorted_runs_.empty());
+ if (sorted_runs_.back().being_compacted) {
+ return nullptr;
+ }
+
+ // Skip files that are already being compacted
+ for (size_t loop = 0; loop < sorted_runs_.size() - 1; loop++) {
+ sr = &sorted_runs_[loop];
+ if (!sr->being_compacted) {
+ start_index = loop; // Consider this as the first candidate.
+ break;
+ }
+ char file_num_buf[kFormatFileNumberBufSize];
+ sr->Dump(file_num_buf, sizeof(file_num_buf), true);
+ ROCKS_LOG_BUFFER(log_buffer_,
+ "[%s] Universal: skipping %s[%d] compacted %s",
+ cf_name_.c_str(), file_num_buf, loop,
+ " cannot be a candidate to reduce size amp.\n");
+ sr = nullptr;
+ }
+
+ if (sr == nullptr) {
+ return nullptr; // no candidate files
+ }
+ {
+ char file_num_buf[kFormatFileNumberBufSize];
+ sr->Dump(file_num_buf, sizeof(file_num_buf), true);
+ ROCKS_LOG_BUFFER(
+ log_buffer_,
+ "[%s] Universal: First candidate %s[%" ROCKSDB_PRIszt "] %s",
+ cf_name_.c_str(), file_num_buf, start_index, " to reduce size amp.\n");
+ }
+
+ // keep adding up all the remaining files
+ for (size_t loop = start_index; loop < sorted_runs_.size() - 1; loop++) {
+ sr = &sorted_runs_[loop];
+ if (sr->being_compacted) {
+ char file_num_buf[kFormatFileNumberBufSize];
+ sr->Dump(file_num_buf, sizeof(file_num_buf), true);
+ ROCKS_LOG_BUFFER(
+ log_buffer_, "[%s] Universal: Possible candidate %s[%d] %s",
+ cf_name_.c_str(), file_num_buf, start_index,
+ " is already being compacted. No size amp reduction possible.\n");
+ return nullptr;
+ }
+ candidate_size += sr->compensated_file_size;
+ candidate_count++;
+ }
+ if (candidate_count == 0) {
+ return nullptr;
+ }
+
+ // size of earliest file
+ uint64_t earliest_file_size = sorted_runs_.back().size;
+
+ // size amplification = percentage of additional size
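+  // For example, with max_size_amplification_percent = 200, compaction
+  // proceeds only once the newer sorted runs together hold at least twice as
+  // much data as the earliest sorted run.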
+ if (candidate_size * 100 < ratio * earliest_file_size) {
+ ROCKS_LOG_BUFFER(
+ log_buffer_,
+ "[%s] Universal: size amp not needed. newer-files-total-size %" PRIu64
+ " earliest-file-size %" PRIu64,
+ cf_name_.c_str(), candidate_size, earliest_file_size);
+ return nullptr;
+ } else {
+ ROCKS_LOG_BUFFER(
+ log_buffer_,
+ "[%s] Universal: size amp needed. newer-files-total-size %" PRIu64
+ " earliest-file-size %" PRIu64,
+ cf_name_.c_str(), candidate_size, earliest_file_size);
+ }
+ return PickCompactionToOldest(start_index,
+ CompactionReason::kUniversalSizeAmplification);
+}
+
+// Pick files marked for compaction. Typically, files are marked by
+// CompactOnDeleteCollector due to the presence of tombstones.
+Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() {
+ CompactionInputFiles start_level_inputs;
+ int output_level;
+ std::vector<CompactionInputFiles> inputs;
+
+ if (vstorage_->num_levels() == 1) {
+ // This is single level universal. Since we're basically trying to reclaim
+ // space by processing files marked for compaction due to high tombstone
+ // density, let's do the same thing as compaction to reduce size amp which
+ // has the same goals.
+ bool compact = false;
+
+ start_level_inputs.level = 0;
+ start_level_inputs.files.clear();
+ output_level = 0;
+ for (FileMetaData* f : vstorage_->LevelFiles(0)) {
+ if (f->marked_for_compaction) {
+ compact = true;
+ }
+ if (compact) {
+ start_level_inputs.files.push_back(f);
+ }
+ }
+ if (start_level_inputs.size() <= 1) {
+ // If only the last file in L0 is marked for compaction, ignore it
+ return nullptr;
+ }
+ inputs.push_back(start_level_inputs);
+ } else {
+ int start_level;
+
+ // For multi-level universal, the strategy is to make this look more like
+ // leveled. We pick one of the files marked for compaction and compact with
+ // overlapping files in the adjacent level.
+ picker_->PickFilesMarkedForCompaction(cf_name_, vstorage_, &start_level,
+ &output_level, &start_level_inputs);
+ if (start_level_inputs.empty()) {
+ return nullptr;
+ }
+
+ // Pick the first non-empty level after the start_level
+ for (output_level = start_level + 1; output_level < vstorage_->num_levels();
+ output_level++) {
+ if (vstorage_->NumLevelFiles(output_level) != 0) {
+ break;
+ }
+ }
+
+ // If all higher levels are empty, pick the highest level as output level
+ if (output_level == vstorage_->num_levels()) {
+ if (start_level == 0) {
+ output_level = vstorage_->num_levels() - 1;
+ } else {
+ // If start level is non-zero and all higher levels are empty, this
+ // compaction will translate into a trivial move. Since the idea is
+ // to reclaim space and trivial move doesn't help with that, we
+ // skip compaction in this case and return nullptr
+ return nullptr;
+ }
+ }
+ if (ioptions_.allow_ingest_behind &&
+ output_level == vstorage_->num_levels() - 1) {
+ assert(output_level > 1);
+ output_level--;
+ }
+
+ if (output_level != 0) {
+ if (start_level == 0) {
+ if (!picker_->GetOverlappingL0Files(vstorage_, &start_level_inputs,
+ output_level, nullptr)) {
+ return nullptr;
+ }
+ }
+
+ CompactionInputFiles output_level_inputs;
+ int parent_index = -1;
+
+ output_level_inputs.level = output_level;
+ if (!picker_->SetupOtherInputs(cf_name_, mutable_cf_options_, vstorage_,
+ &start_level_inputs, &output_level_inputs,
+ &parent_index, -1)) {
+ return nullptr;
+ }
+ inputs.push_back(start_level_inputs);
+ if (!output_level_inputs.empty()) {
+ inputs.push_back(output_level_inputs);
+ }
+ if (picker_->FilesRangeOverlapWithCompaction(inputs, output_level)) {
+ return nullptr;
+ }
+ } else {
+ inputs.push_back(start_level_inputs);
+ }
+ }
+
+ uint64_t estimated_total_size = 0;
+ // Use size of the output level as estimated file size
+ for (FileMetaData* f : vstorage_->LevelFiles(output_level)) {
+ estimated_total_size += f->fd.GetFileSize();
+ }
+ uint32_t path_id =
+ GetPathId(ioptions_, mutable_cf_options_, estimated_total_size);
+ return new Compaction(
+ vstorage_, ioptions_, mutable_cf_options_, std::move(inputs),
+ output_level,
+ MaxFileSizeForLevel(mutable_cf_options_, output_level,
+ kCompactionStyleUniversal),
+ /* max_grandparent_overlap_bytes */ LLONG_MAX, path_id,
+ GetCompressionType(ioptions_, vstorage_, mutable_cf_options_,
+ output_level, 1),
+ GetCompressionOptions(ioptions_, vstorage_, output_level),
+ /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ true,
+ score_, false /* deletion_compaction */,
+ CompactionReason::kFilesMarkedForCompaction);
+}
+
+Compaction* UniversalCompactionBuilder::PickCompactionToOldest(
+ size_t start_index, CompactionReason compaction_reason) {
+ assert(start_index < sorted_runs_.size());
+
+ // Estimate total file size
+ uint64_t estimated_total_size = 0;
+ for (size_t loop = start_index; loop < sorted_runs_.size(); loop++) {
+ estimated_total_size += sorted_runs_[loop].size;
+ }
+ uint32_t path_id =
+ GetPathId(ioptions_, mutable_cf_options_, estimated_total_size);
+ int start_level = sorted_runs_[start_index].level;
+
+ std::vector<CompactionInputFiles> inputs(vstorage_->num_levels());
+ for (size_t i = 0; i < inputs.size(); ++i) {
+ inputs[i].level = start_level + static_cast<int>(i);
+ }
+ for (size_t loop = start_index; loop < sorted_runs_.size(); loop++) {
+ auto& picking_sr = sorted_runs_[loop];
+ if (picking_sr.level == 0) {
+ FileMetaData* f = picking_sr.file;
+ inputs[0].files.push_back(f);
+ } else {
+ auto& files = inputs[picking_sr.level - start_level].files;
+ for (auto* f : vstorage_->LevelFiles(picking_sr.level)) {
+ files.push_back(f);
+ }
+ }
+ std::string comp_reason_print_string;
+ if (compaction_reason == CompactionReason::kPeriodicCompaction) {
+ comp_reason_print_string = "periodic compaction";
+ } else if (compaction_reason ==
+ CompactionReason::kUniversalSizeAmplification) {
+ comp_reason_print_string = "size amp";
+ } else {
+ assert(false);
+ }
+
+ char file_num_buf[256];
+ picking_sr.DumpSizeInfo(file_num_buf, sizeof(file_num_buf), loop);
+ ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: %s picking %s",
+ cf_name_.c_str(), comp_reason_print_string.c_str(),
+ file_num_buf);
+ }
+
+  // output files at the bottommost level, unless it's reserved
+ int output_level = vstorage_->num_levels() - 1;
+ // last level is reserved for the files ingested behind
+ if (ioptions_.allow_ingest_behind) {
+ assert(output_level > 1);
+ output_level--;
+ }
+
+ // We never check size for
+ // compaction_options_universal.compression_size_percent,
+ // because we always compact all the files, so always compress.
+ return new Compaction(
+ vstorage_, ioptions_, mutable_cf_options_, std::move(inputs),
+ output_level,
+ MaxFileSizeForLevel(mutable_cf_options_, output_level,
+ kCompactionStyleUniversal),
+ LLONG_MAX, path_id,
+ GetCompressionType(ioptions_, vstorage_, mutable_cf_options_, start_level,
+ 1, true /* enable_compression */),
+ GetCompressionOptions(ioptions_, vstorage_, start_level,
+ true /* enable_compression */),
+ /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ false,
+ score_, false /* deletion_compaction */, compaction_reason);
+}
+
+Compaction* UniversalCompactionBuilder::PickPeriodicCompaction() {
+ ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: Periodic Compaction",
+ cf_name_.c_str());
+
+  // In universal compaction, sorted runs containing older data are almost
+  // always generated earlier too. To simplify the problem, we just try to
+  // trigger a full compaction. We start from the oldest sorted run and
+  // include all sorted runs, until we hit a sorted run already being
+  // compacted.
+ // Since usually the largest (which is usually the oldest) sorted run is
+ // included anyway, doing a full compaction won't increase write
+ // amplification much.
+
+ // Get some information from marked files to check whether a file is
+ // included in the compaction.
+
+ size_t start_index = sorted_runs_.size();
+ while (start_index > 0 && !sorted_runs_[start_index - 1].being_compacted) {
+ start_index--;
+ }
+ if (start_index == sorted_runs_.size()) {
+ return nullptr;
+ }
+
+  // There is a rare corner case where we can't pick up all the files
+  // because some files are being compacted, and we end up picking files
+  // none of which needs periodic compaction. To keep the logic simple, we
+  // just execute the compaction anyway, unless it would simply recompact
+  // the last sorted run (either the last level or the last L0 file).
+ if (start_index == sorted_runs_.size() - 1) {
+ bool included_file_marked = false;
+ int start_level = sorted_runs_[start_index].level;
+ FileMetaData* start_file = sorted_runs_[start_index].file;
+ for (const std::pair<int, FileMetaData*>& level_file_pair :
+ vstorage_->FilesMarkedForPeriodicCompaction()) {
+ if (start_level != 0) {
+ // Last sorted run is a level
+ if (start_level == level_file_pair.first) {
+ included_file_marked = true;
+ break;
+ }
+ } else {
+        // Last sorted run is an L0 file.
+ if (start_file == level_file_pair.second) {
+ included_file_marked = true;
+ break;
+ }
+ }
+ }
+ if (!included_file_marked) {
+ ROCKS_LOG_BUFFER(log_buffer_,
+ "[%s] Universal: Cannot form a compaction covering file "
+ "marked for periodic compaction",
+ cf_name_.c_str());
+ return nullptr;
+ }
+ }
+
+ Compaction* c = PickCompactionToOldest(start_index,
+ CompactionReason::kPeriodicCompaction);
+
+ TEST_SYNC_POINT_CALLBACK(
+ "UniversalCompactionPicker::PickPeriodicCompaction:Return", c);
+
+ return c;
+}
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/compaction/compaction_picker_universal.h b/src/rocksdb/db/compaction/compaction_picker_universal.h
new file mode 100644
index 000000000..c3f55f5d3
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_universal.h
@@ -0,0 +1,31 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include "db/compaction/compaction_picker.h"
+
+namespace ROCKSDB_NAMESPACE {
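+// Picks compactions using the universal compaction style. A minimal usage
+// sketch, mirroring the pattern used in compaction_picker_test.cc (the
+// surrounding objects are illustrative, not part of this header):
+//
+//   UniversalCompactionPicker picker(ioptions, &icmp);
+//   if (picker.NeedsCompaction(vstorage)) {
+//     std::unique_ptr<Compaction> c(picker.PickCompaction(
+//         cf_name, mutable_cf_options, vstorage, &log_buffer));
+//     // c may still be nullptr if all candidate files are being compacted.
+//   }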
+class UniversalCompactionPicker : public CompactionPicker {
+ public:
+ UniversalCompactionPicker(const ImmutableCFOptions& ioptions,
+ const InternalKeyComparator* icmp)
+ : CompactionPicker(ioptions, icmp) {}
+ virtual Compaction* PickCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* vstorage, LogBuffer* log_buffer,
+ SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override;
+ virtual int MaxOutputLevel() const override { return NumberLevels() - 1; }
+
+ virtual bool NeedsCompaction(
+ const VersionStorageInfo* vstorage) const override;
+};
+} // namespace ROCKSDB_NAMESPACE
+#endif // !ROCKSDB_LITE