Diffstat (limited to 'src/rocksdb/db/compaction')
32 files changed, 25270 insertions, 0 deletions
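The first file in this changeset, clipping_iterator.h, introduces an iterator adapter that restricts another internal iterator to the half-open key range [start, end). As a rough orientation before the diff itself, here is a minimal, self-contained sketch of that clipping idea using plain STL types; the class and names below are illustrative only and are not part of the RocksDB patch.

// Minimal sketch of the clipping idea behind ClippingIterator: expose only
// the keys of a sorted sequence that fall within [start, end). The lower
// bound is inclusive and the upper bound is exclusive, matching the header
// comment in clipping_iterator.h. This mirrors the intent, not the API.
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

class ClippedRange {
 public:
  ClippedRange(const std::vector<std::string>& sorted_keys, std::string start,
               std::string end)
      : keys_(sorted_keys), start_(std::move(start)), end_(std::move(end)) {}

  // Returns the keys in [start_, end_). std::lower_bound stands in for the
  // Seek()/bound-enforcement logic performed by the real iterator.
  std::vector<std::string> Keys() const {
    auto first = std::lower_bound(keys_.begin(), keys_.end(), start_);
    auto last = std::lower_bound(keys_.begin(), keys_.end(), end_);
    return {first, last};
  }

 private:
  std::vector<std::string> keys_;
  std::string start_;
  std::string end_;
};

int main() {
  const std::vector<std::string> keys{"key1", "key2", "key3"};
  ClippedRange clipped(keys, "key2", "key3");
  for (const auto& k : clipped.Keys()) {
    std::cout << k << "\n";  // prints only "key2": "key3" is excluded
  }
  return 0;
}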
diff --git a/src/rocksdb/db/compaction/clipping_iterator.h b/src/rocksdb/db/compaction/clipping_iterator.h new file mode 100644 index 000000000..1ed465c2c --- /dev/null +++ b/src/rocksdb/db/compaction/clipping_iterator.h @@ -0,0 +1,276 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <cassert> + +#include "rocksdb/comparator.h" +#include "table/internal_iterator.h" + +namespace ROCKSDB_NAMESPACE { + +// An internal iterator that wraps another one and ensures that any keys +// returned are strictly within a range [start, end). If the underlying +// iterator has already performed the bounds checking, it relies on that result; +// otherwise, it performs the necessary key comparisons itself. Both bounds +// are optional. +class ClippingIterator : public InternalIterator { + public: + ClippingIterator(InternalIterator* iter, const Slice* start, const Slice* end, + const CompareInterface* cmp) + : iter_(iter), start_(start), end_(end), cmp_(cmp), valid_(false) { + assert(iter_); + assert(cmp_); + assert(!start_ || !end_ || cmp_->Compare(*start_, *end_) <= 0); + + UpdateAndEnforceBounds(); + } + + bool Valid() const override { return valid_; } + + void SeekToFirst() override { + if (start_) { + iter_->Seek(*start_); + } else { + iter_->SeekToFirst(); + } + + UpdateAndEnforceUpperBound(); + } + + void SeekToLast() override { + if (end_) { + iter_->SeekForPrev(*end_); + + // Upper bound is exclusive, so we need a key which is strictly smaller + if (iter_->Valid() && cmp_->Compare(iter_->key(), *end_) == 0) { + iter_->Prev(); + } + } else { + iter_->SeekToLast(); + } + + UpdateAndEnforceLowerBound(); + } + + void Seek(const Slice& target) override { + if (start_ && cmp_->Compare(target, *start_) < 0) { + iter_->Seek(*start_); + UpdateAndEnforceUpperBound(); + return; + } + + if (end_ && cmp_->Compare(target, *end_) >= 0) { + valid_ = false; + return; + } + + iter_->Seek(target); + UpdateAndEnforceUpperBound(); + } + + void SeekForPrev(const Slice& target) override { + if (start_ && cmp_->Compare(target, *start_) < 0) { + valid_ = false; + return; + } + + if (end_ && cmp_->Compare(target, *end_) >= 0) { + iter_->SeekForPrev(*end_); + + // Upper bound is exclusive, so we need a key which is strictly smaller + if (iter_->Valid() && cmp_->Compare(iter_->key(), *end_) == 0) { + iter_->Prev(); + } + + UpdateAndEnforceLowerBound(); + return; + } + + iter_->SeekForPrev(target); + UpdateAndEnforceLowerBound(); + } + + void Next() override { + assert(valid_); + iter_->Next(); + UpdateAndEnforceUpperBound(); + } + + bool NextAndGetResult(IterateResult* result) override { + assert(valid_); + assert(result); + + IterateResult res; + valid_ = iter_->NextAndGetResult(&res); + + if (!valid_) { + return false; + } + + if (end_) { + EnforceUpperBoundImpl(res.bound_check_result); + + if (!valid_) { + return false; + } + } + + res.bound_check_result = IterBoundCheck::kInbound; + *result = res; + + return true; + } + + void Prev() override { + assert(valid_); + iter_->Prev(); + UpdateAndEnforceLowerBound(); + } + + Slice key() const override { + assert(valid_); + return iter_->key(); + } + + Slice user_key() const override { + assert(valid_); + return iter_->user_key(); + } + + Slice value() const override { + assert(valid_); + return iter_->value(); + } + + Status status() const 
override { return iter_->status(); } + + bool PrepareValue() override { + assert(valid_); + + if (iter_->PrepareValue()) { + return true; + } + + assert(!iter_->Valid()); + valid_ = false; + return false; + } + + bool MayBeOutOfLowerBound() override { + assert(valid_); + return false; + } + + IterBoundCheck UpperBoundCheckResult() override { + assert(valid_); + return IterBoundCheck::kInbound; + } + + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { + iter_->SetPinnedItersMgr(pinned_iters_mgr); + } + + bool IsKeyPinned() const override { + assert(valid_); + return iter_->IsKeyPinned(); + } + + bool IsValuePinned() const override { + assert(valid_); + return iter_->IsValuePinned(); + } + + Status GetProperty(std::string prop_name, std::string* prop) override { + return iter_->GetProperty(prop_name, prop); + } + + private: + void UpdateValid() { + assert(!iter_->Valid() || iter_->status().ok()); + + valid_ = iter_->Valid(); + } + + void EnforceUpperBoundImpl(IterBoundCheck bound_check_result) { + if (bound_check_result == IterBoundCheck::kInbound) { + return; + } + + if (bound_check_result == IterBoundCheck::kOutOfBound) { + valid_ = false; + return; + } + + assert(bound_check_result == IterBoundCheck::kUnknown); + + if (cmp_->Compare(key(), *end_) >= 0) { + valid_ = false; + } + } + + void EnforceUpperBound() { + if (!valid_) { + return; + } + + if (!end_) { + return; + } + + EnforceUpperBoundImpl(iter_->UpperBoundCheckResult()); + } + + void EnforceLowerBound() { + if (!valid_) { + return; + } + + if (!start_) { + return; + } + + if (!iter_->MayBeOutOfLowerBound()) { + return; + } + + if (cmp_->Compare(key(), *start_) < 0) { + valid_ = false; + } + } + + void AssertBounds() { + assert(!valid_ || !start_ || cmp_->Compare(key(), *start_) >= 0); + assert(!valid_ || !end_ || cmp_->Compare(key(), *end_) < 0); + } + + void UpdateAndEnforceBounds() { + UpdateValid(); + EnforceUpperBound(); + EnforceLowerBound(); + AssertBounds(); + } + + void UpdateAndEnforceUpperBound() { + UpdateValid(); + EnforceUpperBound(); + AssertBounds(); + } + + void UpdateAndEnforceLowerBound() { + UpdateValid(); + EnforceLowerBound(); + AssertBounds(); + } + + InternalIterator* iter_; + const Slice* start_; + const Slice* end_; + const CompareInterface* cmp_; + bool valid_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/compaction/clipping_iterator_test.cc b/src/rocksdb/db/compaction/clipping_iterator_test.cc new file mode 100644 index 000000000..b2b167048 --- /dev/null +++ b/src/rocksdb/db/compaction/clipping_iterator_test.cc @@ -0,0 +1,259 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/compaction/clipping_iterator.h" + +#include <algorithm> +#include <memory> +#include <string> +#include <vector> + +#include "db/dbformat.h" +#include "rocksdb/comparator.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/vector_iterator.h" + +namespace ROCKSDB_NAMESPACE { + +// A vector iterator which does its own bounds checking. This is for testing the +// optimizations in the clipping iterator where we bypass the bounds checking if +// the input iterator has already performed it. 
+class BoundsCheckingVectorIterator : public VectorIterator { + public: + BoundsCheckingVectorIterator(const std::vector<std::string>& keys, + const std::vector<std::string>& values, + const Slice* start, const Slice* end, + const Comparator* cmp) + : VectorIterator(keys, values, cmp), start_(start), end_(end), cmp_(cmp) { + assert(cmp_); + } + + bool NextAndGetResult(IterateResult* result) override { + assert(Valid()); + assert(result); + + Next(); + + if (!Valid()) { + return false; + } + + result->key = key(); + result->bound_check_result = UpperBoundCheckResult(); + result->value_prepared = true; + + return true; + } + + bool MayBeOutOfLowerBound() override { + assert(Valid()); + + if (!start_) { + return false; + } + + return cmp_->Compare(key(), *start_) < 0; + } + + IterBoundCheck UpperBoundCheckResult() override { + assert(Valid()); + + if (!end_) { + return IterBoundCheck::kInbound; + } + + return cmp_->Compare(key(), *end_) >= 0 ? IterBoundCheck::kOutOfBound + : IterBoundCheck::kInbound; + } + + private: + const Slice* start_; + const Slice* end_; + const Comparator* cmp_; +}; + +class ClippingIteratorTest + : public ::testing::Test, + public ::testing::WithParamInterface<std::tuple<bool, size_t, size_t>> {}; + +TEST_P(ClippingIteratorTest, Clip) { + const std::vector<std::string> keys{"key0", "key1", "key2", "key3", "key4", + "key5", "key6", "key7", "key8", "key9"}; + const std::vector<std::string> values{ + "unused0", "value1", "value2", "value3", "unused4", + "unused5", "unused6", "unused7", "unused8", "unused9"}; + + assert(keys.size() == values.size()); + + // Note: the input always contains key1, key2, and key3; however, the clipping + // window is based on the test parameters: its left edge is a value in the + // range [0, 4], and its size is a value in the range [0, 5] + const std::vector<std::string> input_keys{keys[1], keys[2], keys[3]}; + const std::vector<std::string> input_values{values[1], values[2], values[3]}; + + const bool use_bounds_checking_vec_it = std::get<0>(GetParam()); + + const size_t clip_start_idx = std::get<1>(GetParam()); + const size_t clip_window_size = std::get<2>(GetParam()); + const size_t clip_end_idx = clip_start_idx + clip_window_size; + + const Slice start(keys[clip_start_idx]); + const Slice end(keys[clip_end_idx]); + + std::unique_ptr<InternalIterator> input( + use_bounds_checking_vec_it + ? new BoundsCheckingVectorIterator(input_keys, input_values, &start, + &end, BytewiseComparator()) + : new VectorIterator(input_keys, input_values, BytewiseComparator())); + + ClippingIterator clip(input.get(), &start, &end, BytewiseComparator()); + + // The range the clipping iterator should return values from. 
This is + // essentially the intersection of the input range [1, 4) and the clipping + // window [clip_start_idx, clip_end_idx) + const size_t data_start_idx = + std::max(clip_start_idx, static_cast<size_t>(1)); + const size_t data_end_idx = std::min(clip_end_idx, static_cast<size_t>(4)); + + // Range is empty; all Seeks should fail + if (data_start_idx >= data_end_idx) { + clip.SeekToFirst(); + ASSERT_FALSE(clip.Valid()); + + clip.SeekToLast(); + ASSERT_FALSE(clip.Valid()); + + for (size_t i = 0; i < keys.size(); ++i) { + clip.Seek(keys[i]); + ASSERT_FALSE(clip.Valid()); + + clip.SeekForPrev(keys[i]); + ASSERT_FALSE(clip.Valid()); + } + + return; + } + + // Range is non-empty; call SeekToFirst and iterate forward + clip.SeekToFirst(); + ASSERT_TRUE(clip.Valid()); + ASSERT_EQ(clip.key(), keys[data_start_idx]); + ASSERT_EQ(clip.value(), values[data_start_idx]); + ASSERT_FALSE(clip.MayBeOutOfLowerBound()); + ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound); + + for (size_t i = data_start_idx + 1; i < data_end_idx; ++i) { + clip.Next(); + ASSERT_TRUE(clip.Valid()); + ASSERT_EQ(clip.key(), keys[i]); + ASSERT_EQ(clip.value(), values[i]); + ASSERT_FALSE(clip.MayBeOutOfLowerBound()); + ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound); + } + + clip.Next(); + ASSERT_FALSE(clip.Valid()); + + // Do it again using NextAndGetResult + clip.SeekToFirst(); + ASSERT_TRUE(clip.Valid()); + ASSERT_EQ(clip.key(), keys[data_start_idx]); + ASSERT_EQ(clip.value(), values[data_start_idx]); + ASSERT_FALSE(clip.MayBeOutOfLowerBound()); + ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound); + + for (size_t i = data_start_idx + 1; i < data_end_idx; ++i) { + IterateResult result; + ASSERT_TRUE(clip.NextAndGetResult(&result)); + ASSERT_EQ(result.key, keys[i]); + ASSERT_EQ(result.bound_check_result, IterBoundCheck::kInbound); + ASSERT_TRUE(clip.Valid()); + ASSERT_EQ(clip.key(), keys[i]); + ASSERT_EQ(clip.value(), values[i]); + ASSERT_FALSE(clip.MayBeOutOfLowerBound()); + ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound); + } + + IterateResult result; + ASSERT_FALSE(clip.NextAndGetResult(&result)); + ASSERT_FALSE(clip.Valid()); + + // Call SeekToLast and iterate backward + clip.SeekToLast(); + ASSERT_TRUE(clip.Valid()); + ASSERT_EQ(clip.key(), keys[data_end_idx - 1]); + ASSERT_EQ(clip.value(), values[data_end_idx - 1]); + ASSERT_FALSE(clip.MayBeOutOfLowerBound()); + ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound); + + for (size_t i = data_end_idx - 2; i >= data_start_idx; --i) { + clip.Prev(); + ASSERT_TRUE(clip.Valid()); + ASSERT_EQ(clip.key(), keys[i]); + ASSERT_EQ(clip.value(), values[i]); + ASSERT_FALSE(clip.MayBeOutOfLowerBound()); + ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound); + } + + clip.Prev(); + ASSERT_FALSE(clip.Valid()); + + // Call Seek/SeekForPrev for all keys; Seek should return the smallest key + // which is >= the target; SeekForPrev should return the largest key which is + // <= the target + for (size_t i = 0; i < keys.size(); ++i) { + clip.Seek(keys[i]); + + if (i < data_start_idx) { + ASSERT_TRUE(clip.Valid()); + ASSERT_EQ(clip.key(), keys[data_start_idx]); + ASSERT_EQ(clip.value(), values[data_start_idx]); + ASSERT_FALSE(clip.MayBeOutOfLowerBound()); + ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound); + } else if (i < data_end_idx) { + ASSERT_TRUE(clip.Valid()); + ASSERT_EQ(clip.key(), keys[i]); + ASSERT_EQ(clip.value(), values[i]); + ASSERT_FALSE(clip.MayBeOutOfLowerBound()); + 
ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound); + } else { + ASSERT_FALSE(clip.Valid()); + } + + clip.SeekForPrev(keys[i]); + + if (i < data_start_idx) { + ASSERT_FALSE(clip.Valid()); + } else if (i < data_end_idx) { + ASSERT_TRUE(clip.Valid()); + ASSERT_EQ(clip.key(), keys[i]); + ASSERT_EQ(clip.value(), values[i]); + ASSERT_FALSE(clip.MayBeOutOfLowerBound()); + ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound); + } else { + ASSERT_TRUE(clip.Valid()); + ASSERT_EQ(clip.key(), keys[data_end_idx - 1]); + ASSERT_EQ(clip.value(), values[data_end_idx - 1]); + ASSERT_FALSE(clip.MayBeOutOfLowerBound()); + ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound); + } + } +} + +INSTANTIATE_TEST_CASE_P( + ClippingIteratorTest, ClippingIteratorTest, + ::testing::Combine( + ::testing::Bool(), + ::testing::Range(static_cast<size_t>(0), static_cast<size_t>(5)), + ::testing::Range(static_cast<size_t>(0), static_cast<size_t>(6)))); + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/compaction/compaction.cc b/src/rocksdb/db/compaction/compaction.cc new file mode 100644 index 000000000..a32b529f7 --- /dev/null +++ b/src/rocksdb/db/compaction/compaction.cc @@ -0,0 +1,855 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "db/compaction/compaction.h" + +#include <cinttypes> +#include <vector> + +#include "db/column_family.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/sst_partitioner.h" +#include "test_util/sync_point.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +const uint64_t kRangeTombstoneSentinel = + PackSequenceAndType(kMaxSequenceNumber, kTypeRangeDeletion); + +int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a, + const InternalKey& b) { + auto c = user_cmp->CompareWithoutTimestamp(a.user_key(), b.user_key()); + if (c != 0) { + return c; + } + auto a_footer = ExtractInternalKeyFooter(a.Encode()); + auto b_footer = ExtractInternalKeyFooter(b.Encode()); + if (a_footer == kRangeTombstoneSentinel) { + if (b_footer != kRangeTombstoneSentinel) { + return -1; + } + } else if (b_footer == kRangeTombstoneSentinel) { + return 1; + } + return 0; +} + +int sstableKeyCompare(const Comparator* user_cmp, const InternalKey* a, + const InternalKey& b) { + if (a == nullptr) { + return -1; + } + return sstableKeyCompare(user_cmp, *a, b); +} + +int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a, + const InternalKey* b) { + if (b == nullptr) { + return -1; + } + return sstableKeyCompare(user_cmp, a, *b); +} + +uint64_t TotalFileSize(const std::vector<FileMetaData*>& files) { + uint64_t sum = 0; + for (size_t i = 0; i < files.size() && files[i]; i++) { + sum += files[i]->fd.GetFileSize(); + } + return sum; +} + +void Compaction::SetInputVersion(Version* _input_version) { + input_version_ = _input_version; + cfd_ = input_version_->cfd(); + + cfd_->Ref(); + input_version_->Ref(); + edit_.SetColumnFamily(cfd_->GetID()); +} + +void Compaction::GetBoundaryKeys( + VersionStorageInfo* vstorage, + const std::vector<CompactionInputFiles>& inputs, Slice* smallest_user_key, + Slice* largest_user_key, int exclude_level) { + bool initialized = false; + const Comparator* ucmp = vstorage->InternalComparator()->user_comparator(); + for (size_t i = 0; i < inputs.size(); ++i) { + if (inputs[i].files.empty() || inputs[i].level == exclude_level) { + continue; + } + if (inputs[i].level == 0) { + // we need to consider all files on level 0 + for (const auto* f : inputs[i].files) { + const Slice& start_user_key = f->smallest.user_key(); + if (!initialized || + ucmp->Compare(start_user_key, *smallest_user_key) < 0) { + *smallest_user_key = start_user_key; + } + const Slice& end_user_key = f->largest.user_key(); + if (!initialized || + ucmp->Compare(end_user_key, *largest_user_key) > 0) { + *largest_user_key = end_user_key; + } + initialized = true; + } + } else { + // we only need to consider the first and last file + const Slice& start_user_key = inputs[i].files[0]->smallest.user_key(); + if (!initialized || + ucmp->Compare(start_user_key, *smallest_user_key) < 0) { + *smallest_user_key = start_user_key; + } + const Slice& end_user_key = inputs[i].files.back()->largest.user_key(); + if (!initialized || ucmp->Compare(end_user_key, *largest_user_key) > 0) { + *largest_user_key = end_user_key; + } + initialized = true; + } + } +} + +std::vector<CompactionInputFiles> Compaction::PopulateWithAtomicBoundaries( + VersionStorageInfo* vstorage, std::vector<CompactionInputFiles> inputs) { + const Comparator* ucmp = vstorage->InternalComparator()->user_comparator(); + for (size_t i = 0; i < inputs.size(); i++) { + if (inputs[i].level == 0 || inputs[i].files.empty()) { + continue; + } + inputs[i].atomic_compaction_unit_boundaries.reserve(inputs[i].files.size()); + 
AtomicCompactionUnitBoundary cur_boundary; + size_t first_atomic_idx = 0; + auto add_unit_boundary = [&](size_t to) { + if (first_atomic_idx == to) return; + for (size_t k = first_atomic_idx; k < to; k++) { + inputs[i].atomic_compaction_unit_boundaries.push_back(cur_boundary); + } + first_atomic_idx = to; + }; + for (size_t j = 0; j < inputs[i].files.size(); j++) { + const auto* f = inputs[i].files[j]; + if (j == 0) { + // First file in a level. + cur_boundary.smallest = &f->smallest; + cur_boundary.largest = &f->largest; + } else if (sstableKeyCompare(ucmp, *cur_boundary.largest, f->smallest) == + 0) { + // SSTs overlap but the end key of the previous file was not + // artificially extended by a range tombstone. Extend the current + // boundary. + cur_boundary.largest = &f->largest; + } else { + // Atomic compaction unit has ended. + add_unit_boundary(j); + cur_boundary.smallest = &f->smallest; + cur_boundary.largest = &f->largest; + } + } + add_unit_boundary(inputs[i].files.size()); + assert(inputs[i].files.size() == + inputs[i].atomic_compaction_unit_boundaries.size()); + } + return inputs; +} + +// helper function to determine if compaction is creating files at the +// bottommost level +bool Compaction::IsBottommostLevel( + int output_level, VersionStorageInfo* vstorage, + const std::vector<CompactionInputFiles>& inputs) { + int output_l0_idx; + if (output_level == 0) { + output_l0_idx = 0; + for (const auto* file : vstorage->LevelFiles(0)) { + if (inputs[0].files.back() == file) { + break; + } + ++output_l0_idx; + } + assert(static_cast<size_t>(output_l0_idx) < vstorage->LevelFiles(0).size()); + } else { + output_l0_idx = -1; + } + Slice smallest_key, largest_key; + GetBoundaryKeys(vstorage, inputs, &smallest_key, &largest_key); + return !vstorage->RangeMightExistAfterSortedRun(smallest_key, largest_key, + output_level, output_l0_idx); +} + +// test function to validate the functionality of IsBottommostLevel() +// function -- determines if compaction with inputs and storage is bottommost +bool Compaction::TEST_IsBottommostLevel( + int output_level, VersionStorageInfo* vstorage, + const std::vector<CompactionInputFiles>& inputs) { + return IsBottommostLevel(output_level, vstorage, inputs); +} + +bool Compaction::IsFullCompaction( + VersionStorageInfo* vstorage, + const std::vector<CompactionInputFiles>& inputs) { + size_t num_files_in_compaction = 0; + size_t total_num_files = 0; + for (int l = 0; l < vstorage->num_levels(); l++) { + total_num_files += vstorage->NumLevelFiles(l); + } + for (size_t i = 0; i < inputs.size(); i++) { + num_files_in_compaction += inputs[i].size(); + } + return num_files_in_compaction == total_num_files; +} + +Compaction::Compaction( + VersionStorageInfo* vstorage, const ImmutableOptions& _immutable_options, + const MutableCFOptions& _mutable_cf_options, + const MutableDBOptions& _mutable_db_options, + std::vector<CompactionInputFiles> _inputs, int _output_level, + uint64_t _target_file_size, uint64_t _max_compaction_bytes, + uint32_t _output_path_id, CompressionType _compression, + CompressionOptions _compression_opts, Temperature _output_temperature, + uint32_t _max_subcompactions, std::vector<FileMetaData*> _grandparents, + bool _manual_compaction, const std::string& _trim_ts, double _score, + bool _deletion_compaction, bool l0_files_might_overlap, + CompactionReason _compaction_reason, + BlobGarbageCollectionPolicy _blob_garbage_collection_policy, + double _blob_garbage_collection_age_cutoff) + : input_vstorage_(vstorage), + 
start_level_(_inputs[0].level), + output_level_(_output_level), + target_output_file_size_(_target_file_size), + max_compaction_bytes_(_max_compaction_bytes), + max_subcompactions_(_max_subcompactions), + immutable_options_(_immutable_options), + mutable_cf_options_(_mutable_cf_options), + input_version_(nullptr), + number_levels_(vstorage->num_levels()), + cfd_(nullptr), + output_path_id_(_output_path_id), + output_compression_(_compression), + output_compression_opts_(_compression_opts), + output_temperature_(_output_temperature), + deletion_compaction_(_deletion_compaction), + l0_files_might_overlap_(l0_files_might_overlap), + inputs_(PopulateWithAtomicBoundaries(vstorage, std::move(_inputs))), + grandparents_(std::move(_grandparents)), + score_(_score), + bottommost_level_(IsBottommostLevel(output_level_, vstorage, inputs_)), + is_full_compaction_(IsFullCompaction(vstorage, inputs_)), + is_manual_compaction_(_manual_compaction), + trim_ts_(_trim_ts), + is_trivial_move_(false), + + compaction_reason_(_compaction_reason), + notify_on_compaction_completion_(false), + enable_blob_garbage_collection_( + _blob_garbage_collection_policy == BlobGarbageCollectionPolicy::kForce + ? true + : (_blob_garbage_collection_policy == + BlobGarbageCollectionPolicy::kDisable + ? false + : mutable_cf_options()->enable_blob_garbage_collection)), + blob_garbage_collection_age_cutoff_( + _blob_garbage_collection_age_cutoff < 0 || + _blob_garbage_collection_age_cutoff > 1 + ? mutable_cf_options()->blob_garbage_collection_age_cutoff + : _blob_garbage_collection_age_cutoff), + penultimate_level_(EvaluatePenultimateLevel( + vstorage, immutable_options_, start_level_, output_level_)) { + MarkFilesBeingCompacted(true); + if (is_manual_compaction_) { + compaction_reason_ = CompactionReason::kManualCompaction; + } + if (max_subcompactions_ == 0) { + max_subcompactions_ = _mutable_db_options.max_subcompactions; + } + + // for the non-bottommost levels, it tries to build files match the target + // file size, but not guaranteed. It could be 2x the size of the target size. + max_output_file_size_ = + bottommost_level_ || grandparents_.empty() || + !_immutable_options.level_compaction_dynamic_file_size + ? 
target_output_file_size_ + : 2 * target_output_file_size_; + +#ifndef NDEBUG + for (size_t i = 1; i < inputs_.size(); ++i) { + assert(inputs_[i].level > inputs_[i - 1].level); + } +#endif + + // setup input_levels_ + { + input_levels_.resize(num_input_levels()); + for (size_t which = 0; which < num_input_levels(); which++) { + DoGenerateLevelFilesBrief(&input_levels_[which], inputs_[which].files, + &arena_); + } + } + + GetBoundaryKeys(vstorage, inputs_, &smallest_user_key_, &largest_user_key_); + + // Every compaction regardless of any compaction reason may respect the + // existing compact cursor in the output level to split output files + output_split_key_ = nullptr; + if (immutable_options_.compaction_style == kCompactionStyleLevel && + immutable_options_.compaction_pri == kRoundRobin) { + const InternalKey* cursor = + &input_vstorage_->GetCompactCursors()[output_level_]; + if (cursor->size() != 0) { + const Slice& cursor_user_key = ExtractUserKey(cursor->Encode()); + auto ucmp = vstorage->InternalComparator()->user_comparator(); + // May split output files according to the cursor if it in the user-key + // range + if (ucmp->CompareWithoutTimestamp(cursor_user_key, smallest_user_key_) > + 0 && + ucmp->CompareWithoutTimestamp(cursor_user_key, largest_user_key_) <= + 0) { + output_split_key_ = cursor; + } + } + } + + PopulatePenultimateLevelOutputRange(); +} + +void Compaction::PopulatePenultimateLevelOutputRange() { + if (!SupportsPerKeyPlacement()) { + return; + } + + // exclude the last level, the range of all input levels is the safe range + // of keys that can be moved up. + int exclude_level = number_levels_ - 1; + penultimate_output_range_type_ = PenultimateOutputRangeType::kNonLastRange; + + // For universal compaction, the penultimate_output_range could be extended if + // all penultimate level files are included in the compaction (which includes + // the case that the penultimate level is empty). + if (immutable_options_.compaction_style == kCompactionStyleUniversal) { + exclude_level = kInvalidLevel; + std::set<uint64_t> penultimate_inputs; + for (const auto& input_lvl : inputs_) { + if (input_lvl.level == penultimate_level_) { + for (const auto& file : input_lvl.files) { + penultimate_inputs.emplace(file->fd.GetNumber()); + } + } + } + auto penultimate_files = input_vstorage_->LevelFiles(penultimate_level_); + for (const auto& file : penultimate_files) { + if (penultimate_inputs.find(file->fd.GetNumber()) == + penultimate_inputs.end()) { + exclude_level = number_levels_ - 1; + penultimate_output_range_type_ = PenultimateOutputRangeType::kFullRange; + break; + } + } + } + + GetBoundaryKeys(input_vstorage_, inputs_, + &penultimate_level_smallest_user_key_, + &penultimate_level_largest_user_key_, exclude_level); + + // If there's a case that the penultimate level output range is overlapping + // with the existing files, disable the penultimate level output by setting + // the range to empty. One example is the range delete could have overlap + // boundary with the next file. (which is actually a false overlap) + // TODO: Exclude such false overlap, so it won't disable the penultimate + // output. 
+ std::set<uint64_t> penultimate_inputs; + for (const auto& input_lvl : inputs_) { + if (input_lvl.level == penultimate_level_) { + for (const auto& file : input_lvl.files) { + penultimate_inputs.emplace(file->fd.GetNumber()); + } + } + } + + auto penultimate_files = input_vstorage_->LevelFiles(penultimate_level_); + for (const auto& file : penultimate_files) { + if (penultimate_inputs.find(file->fd.GetNumber()) == + penultimate_inputs.end() && + OverlapPenultimateLevelOutputRange(file->smallest.user_key(), + file->largest.user_key())) { + // basically disable the penultimate range output. which should be rare + // or a false overlap caused by range del + penultimate_level_smallest_user_key_ = ""; + penultimate_level_largest_user_key_ = ""; + penultimate_output_range_type_ = PenultimateOutputRangeType::kDisabled; + } + } +} + +Compaction::~Compaction() { + if (input_version_ != nullptr) { + input_version_->Unref(); + } + if (cfd_ != nullptr) { + cfd_->UnrefAndTryDelete(); + } +} + +bool Compaction::SupportsPerKeyPlacement() const { + return penultimate_level_ != kInvalidLevel; +} + +int Compaction::GetPenultimateLevel() const { return penultimate_level_; } + +// smallest_key and largest_key include timestamps if user-defined timestamp is +// enabled. +bool Compaction::OverlapPenultimateLevelOutputRange( + const Slice& smallest_key, const Slice& largest_key) const { + if (!SupportsPerKeyPlacement()) { + return false; + } + const Comparator* ucmp = + input_vstorage_->InternalComparator()->user_comparator(); + + return ucmp->CompareWithoutTimestamp( + smallest_key, penultimate_level_largest_user_key_) <= 0 && + ucmp->CompareWithoutTimestamp( + largest_key, penultimate_level_smallest_user_key_) >= 0; +} + +// key includes timestamp if user-defined timestamp is enabled. +bool Compaction::WithinPenultimateLevelOutputRange(const Slice& key) const { + if (!SupportsPerKeyPlacement()) { + return false; + } + + if (penultimate_level_smallest_user_key_.empty() || + penultimate_level_largest_user_key_.empty()) { + return false; + } + + const Comparator* ucmp = + input_vstorage_->InternalComparator()->user_comparator(); + + return ucmp->CompareWithoutTimestamp( + key, penultimate_level_smallest_user_key_) >= 0 && + ucmp->CompareWithoutTimestamp( + key, penultimate_level_largest_user_key_) <= 0; +} + +bool Compaction::InputCompressionMatchesOutput() const { + int base_level = input_vstorage_->base_level(); + bool matches = + (GetCompressionType(input_vstorage_, mutable_cf_options_, start_level_, + base_level) == output_compression_); + if (matches) { + TEST_SYNC_POINT("Compaction::InputCompressionMatchesOutput:Matches"); + return true; + } + TEST_SYNC_POINT("Compaction::InputCompressionMatchesOutput:DidntMatch"); + return matches; +} + +bool Compaction::IsTrivialMove() const { + // Avoid a move if there is lots of overlapping grandparent data. + // Otherwise, the move could create a parent file that will require + // a very expensive merge later on. + // If start_level_== output_level_, the purpose is to force compaction + // filter to be applied to that level, and thus cannot be a trivial move. + + // Check if start level have files with overlapping ranges + if (start_level_ == 0 && input_vstorage_->level0_non_overlapping() == false && + l0_files_might_overlap_) { + // We cannot move files from L0 to L1 if the L0 files in the LSM-tree are + // overlapping, unless we are sure that files picked in L0 don't overlap. 
+ return false; + } + + if (is_manual_compaction_ && + (immutable_options_.compaction_filter != nullptr || + immutable_options_.compaction_filter_factory != nullptr)) { + // This is a manual compaction and we have a compaction filter that should + // be executed, we cannot do a trivial move + return false; + } + + if (start_level_ == output_level_) { + // It doesn't make sense if compaction picker picks files just to trivial + // move to the same level. + return false; + } + + // Used in universal compaction, where trivial move can be done if the + // input files are non overlapping + if ((mutable_cf_options_.compaction_options_universal.allow_trivial_move) && + (output_level_ != 0) && + (cfd_->ioptions()->compaction_style == kCompactionStyleUniversal)) { + return is_trivial_move_; + } + + if (!(start_level_ != output_level_ && num_input_levels() == 1 && + input(0, 0)->fd.GetPathId() == output_path_id() && + InputCompressionMatchesOutput())) { + return false; + } + + // assert inputs_.size() == 1 + + std::unique_ptr<SstPartitioner> partitioner = CreateSstPartitioner(); + + for (const auto& file : inputs_.front().files) { + std::vector<FileMetaData*> file_grand_parents; + if (output_level_ + 1 >= number_levels_) { + continue; + } + input_vstorage_->GetOverlappingInputs(output_level_ + 1, &file->smallest, + &file->largest, &file_grand_parents); + const auto compaction_size = + file->fd.GetFileSize() + TotalFileSize(file_grand_parents); + if (compaction_size > max_compaction_bytes_) { + return false; + } + + if (partitioner.get() != nullptr) { + if (!partitioner->CanDoTrivialMove(file->smallest.user_key(), + file->largest.user_key())) { + return false; + } + } + } + + // PerKeyPlacement compaction should never be trivial move. + if (SupportsPerKeyPlacement()) { + return false; + } + + return true; +} + +void Compaction::AddInputDeletions(VersionEdit* out_edit) { + for (size_t which = 0; which < num_input_levels(); which++) { + for (size_t i = 0; i < inputs_[which].size(); i++) { + out_edit->DeleteFile(level(which), inputs_[which][i]->fd.GetNumber()); + } + } +} + +bool Compaction::KeyNotExistsBeyondOutputLevel( + const Slice& user_key, std::vector<size_t>* level_ptrs) const { + assert(input_version_ != nullptr); + assert(level_ptrs != nullptr); + assert(level_ptrs->size() == static_cast<size_t>(number_levels_)); + if (bottommost_level_) { + return true; + } else if (output_level_ != 0 && + cfd_->ioptions()->compaction_style == kCompactionStyleLevel) { + // Maybe use binary search to find right entry instead of linear search? + const Comparator* user_cmp = cfd_->user_comparator(); + for (int lvl = output_level_ + 1; lvl < number_levels_; lvl++) { + const std::vector<FileMetaData*>& files = + input_vstorage_->LevelFiles(lvl); + for (; level_ptrs->at(lvl) < files.size(); level_ptrs->at(lvl)++) { + auto* f = files[level_ptrs->at(lvl)]; + if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) { + // We've advanced far enough + // In the presence of user-defined timestamp, we may need to handle + // the case in which f->smallest.user_key() (including ts) has the + // same user key, but the ts part is smaller. If so, + // Compare(user_key, f->smallest.user_key()) returns -1. + // That's why we need CompareWithoutTimestamp(). 
+ if (user_cmp->CompareWithoutTimestamp(user_key, + f->smallest.user_key()) >= 0) { + // Key falls in this file's range, so it may + // exist beyond output level + return false; + } + break; + } + } + } + return true; + } + return false; +} + +// Mark (or clear) each file that is being compacted +void Compaction::MarkFilesBeingCompacted(bool mark_as_compacted) { + for (size_t i = 0; i < num_input_levels(); i++) { + for (size_t j = 0; j < inputs_[i].size(); j++) { + assert(mark_as_compacted ? !inputs_[i][j]->being_compacted + : inputs_[i][j]->being_compacted); + inputs_[i][j]->being_compacted = mark_as_compacted; + } + } +} + +// Sample output: +// If compacting 3 L0 files, 2 L3 files and 1 L4 file, and outputting to L5, +// print: "3@0 + 2@3 + 1@4 files to L5" +const char* Compaction::InputLevelSummary( + InputLevelSummaryBuffer* scratch) const { + int len = 0; + bool is_first = true; + for (auto& input_level : inputs_) { + if (input_level.empty()) { + continue; + } + if (!is_first) { + len += + snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, " + "); + len = std::min(len, static_cast<int>(sizeof(scratch->buffer))); + } else { + is_first = false; + } + len += snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, + "%" ROCKSDB_PRIszt "@%d", input_level.size(), + input_level.level); + len = std::min(len, static_cast<int>(sizeof(scratch->buffer))); + } + snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, + " files to L%d", output_level()); + + return scratch->buffer; +} + +uint64_t Compaction::CalculateTotalInputSize() const { + uint64_t size = 0; + for (auto& input_level : inputs_) { + for (auto f : input_level.files) { + size += f->fd.GetFileSize(); + } + } + return size; +} + +void Compaction::ReleaseCompactionFiles(Status status) { + MarkFilesBeingCompacted(false); + cfd_->compaction_picker()->ReleaseCompactionFiles(this, status); +} + +void Compaction::ResetNextCompactionIndex() { + assert(input_version_ != nullptr); + input_vstorage_->ResetNextCompactionIndex(start_level_); +} + +namespace { +int InputSummary(const std::vector<FileMetaData*>& files, char* output, + int len) { + *output = '\0'; + int write = 0; + for (size_t i = 0; i < files.size(); i++) { + int sz = len - write; + int ret; + char sztxt[16]; + AppendHumanBytes(files.at(i)->fd.GetFileSize(), sztxt, 16); + ret = snprintf(output + write, sz, "%" PRIu64 "(%s) ", + files.at(i)->fd.GetNumber(), sztxt); + if (ret < 0 || ret >= sz) break; + write += ret; + } + // if files.size() is non-zero, overwrite the last space + return write - !!files.size(); +} +} // namespace + +void Compaction::Summary(char* output, int len) { + int write = + snprintf(output, len, "Base version %" PRIu64 " Base level %d, inputs: [", + input_version_->GetVersionNumber(), start_level_); + if (write < 0 || write >= len) { + return; + } + + for (size_t level_iter = 0; level_iter < num_input_levels(); ++level_iter) { + if (level_iter > 0) { + write += snprintf(output + write, len - write, "], ["); + if (write < 0 || write >= len) { + return; + } + } + write += + InputSummary(inputs_[level_iter].files, output + write, len - write); + if (write < 0 || write >= len) { + return; + } + } + + snprintf(output + write, len - write, "]"); +} + +uint64_t Compaction::OutputFilePreallocationSize() const { + uint64_t preallocation_size = 0; + + for (const auto& level_files : inputs_) { + for (const auto& file : level_files.files) { + preallocation_size += file->fd.GetFileSize(); + } + } + + if (max_output_file_size_ != 
std::numeric_limits<uint64_t>::max() && + (immutable_options_.compaction_style == kCompactionStyleLevel || + output_level() > 0)) { + preallocation_size = std::min(max_output_file_size_, preallocation_size); + } + + // Over-estimate slightly so we don't end up just barely crossing + // the threshold + // No point to preallocate more than 1GB. + return std::min(uint64_t{1073741824}, + preallocation_size + (preallocation_size / 10)); +} + +std::unique_ptr<CompactionFilter> Compaction::CreateCompactionFilter() const { + if (!cfd_->ioptions()->compaction_filter_factory) { + return nullptr; + } + + if (!cfd_->ioptions() + ->compaction_filter_factory->ShouldFilterTableFileCreation( + TableFileCreationReason::kCompaction)) { + return nullptr; + } + + CompactionFilter::Context context; + context.is_full_compaction = is_full_compaction_; + context.is_manual_compaction = is_manual_compaction_; + context.column_family_id = cfd_->GetID(); + context.reason = TableFileCreationReason::kCompaction; + return cfd_->ioptions()->compaction_filter_factory->CreateCompactionFilter( + context); +} + +std::unique_ptr<SstPartitioner> Compaction::CreateSstPartitioner() const { + if (!immutable_options_.sst_partitioner_factory) { + return nullptr; + } + + SstPartitioner::Context context; + context.is_full_compaction = is_full_compaction_; + context.is_manual_compaction = is_manual_compaction_; + context.output_level = output_level_; + context.smallest_user_key = smallest_user_key_; + context.largest_user_key = largest_user_key_; + return immutable_options_.sst_partitioner_factory->CreatePartitioner(context); +} + +bool Compaction::IsOutputLevelEmpty() const { + return inputs_.back().level != output_level_ || inputs_.back().empty(); +} + +bool Compaction::ShouldFormSubcompactions() const { + if (cfd_ == nullptr) { + return false; + } + + // Round-Robin pri under leveled compaction allows subcompactions by default + // and the number of subcompactions can be larger than max_subcompactions_ + if (cfd_->ioptions()->compaction_pri == kRoundRobin && + cfd_->ioptions()->compaction_style == kCompactionStyleLevel) { + return output_level_ > 0; + } + + if (max_subcompactions_ <= 1) { + return false; + } + + if (cfd_->ioptions()->compaction_style == kCompactionStyleLevel) { + return (start_level_ == 0 || is_manual_compaction_) && output_level_ > 0; + } else if (cfd_->ioptions()->compaction_style == kCompactionStyleUniversal) { + return number_levels_ > 1 && output_level_ > 0; + } else { + return false; + } +} + +bool Compaction::DoesInputReferenceBlobFiles() const { + assert(input_version_); + + const VersionStorageInfo* storage_info = input_version_->storage_info(); + assert(storage_info); + + if (storage_info->GetBlobFiles().empty()) { + return false; + } + + for (size_t i = 0; i < inputs_.size(); ++i) { + for (const FileMetaData* meta : inputs_[i].files) { + assert(meta); + + if (meta->oldest_blob_file_number != kInvalidBlobFileNumber) { + return true; + } + } + } + + return false; +} + +uint64_t Compaction::MinInputFileOldestAncesterTime( + const InternalKey* start, const InternalKey* end) const { + uint64_t min_oldest_ancester_time = std::numeric_limits<uint64_t>::max(); + const InternalKeyComparator& icmp = + column_family_data()->internal_comparator(); + for (const auto& level_files : inputs_) { + for (const auto& file : level_files.files) { + if (start != nullptr && icmp.Compare(file->largest, *start) < 0) { + continue; + } + if (end != nullptr && icmp.Compare(file->smallest, *end) > 0) { + continue; + } + uint64_t 
oldest_ancester_time = file->TryGetOldestAncesterTime(); + if (oldest_ancester_time != 0) { + min_oldest_ancester_time = + std::min(min_oldest_ancester_time, oldest_ancester_time); + } + } + } + return min_oldest_ancester_time; +} + +int Compaction::EvaluatePenultimateLevel( + const VersionStorageInfo* vstorage, + const ImmutableOptions& immutable_options, const int start_level, + const int output_level) { + // TODO: currently per_key_placement feature only support level and universal + // compaction + if (immutable_options.compaction_style != kCompactionStyleLevel && + immutable_options.compaction_style != kCompactionStyleUniversal) { + return kInvalidLevel; + } + if (output_level != immutable_options.num_levels - 1) { + return kInvalidLevel; + } + + int penultimate_level = output_level - 1; + assert(penultimate_level < immutable_options.num_levels); + if (penultimate_level <= 0) { + return kInvalidLevel; + } + + // If the penultimate level is not within input level -> output level range + // check if the penultimate output level is empty, if it's empty, it could + // also be locked for the penultimate output. + // TODO: ideally, it only needs to check if there's a file within the + // compaction output key range. For simplicity, it just check if there's any + // file on the penultimate level. + if (start_level == immutable_options.num_levels - 1 && + (immutable_options.compaction_style != kCompactionStyleUniversal || + !vstorage->LevelFiles(penultimate_level).empty())) { + return kInvalidLevel; + } + + bool supports_per_key_placement = + immutable_options.preclude_last_level_data_seconds > 0; + + // it could be overridden by unittest + TEST_SYNC_POINT_CALLBACK("Compaction::SupportsPerKeyPlacement:Enabled", + &supports_per_key_placement); + if (!supports_per_key_placement) { + return kInvalidLevel; + } + + return penultimate_level; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/compaction/compaction.h b/src/rocksdb/db/compaction/compaction.h new file mode 100644 index 000000000..21d1190ac --- /dev/null +++ b/src/rocksdb/db/compaction/compaction.h @@ -0,0 +1,559 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include "db/version_set.h" +#include "memory/arena.h" +#include "options/cf_options.h" +#include "rocksdb/sst_partitioner.h" +#include "util/autovector.h" + +namespace ROCKSDB_NAMESPACE { +// The file contains class Compaction, as well as some helper functions +// and data structures used by the class. + +// Utility for comparing sstable boundary keys. Returns -1 if either a or b is +// null which provides the property that a==null indicates a key that is less +// than any key and b==null indicates a key that is greater than any key. Note +// that the comparison is performed primarily on the user-key portion of the +// key. If the user-keys compare equal, an additional test is made to sort +// range tombstone sentinel keys before other keys with the same user-key. 
The +// result is that 2 user-keys will compare equal if they differ purely on +// their sequence number and value, but the range tombstone sentinel for that +// user-key will compare not equal. This is necessary because the range +// tombstone sentinel key is set as the largest key for an sstable even though +// that key never appears in the database. We don't want adjacent sstables to +// be considered overlapping if they are separated by the range tombstone +// sentinel. +int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a, + const InternalKey& b); +int sstableKeyCompare(const Comparator* user_cmp, const InternalKey* a, + const InternalKey& b); +int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a, + const InternalKey* b); + +// An AtomicCompactionUnitBoundary represents a range of keys [smallest, +// largest] that exactly spans one ore more neighbouring SSTs on the same +// level. Every pair of SSTs in this range "overlap" (i.e., the largest +// user key of one file is the smallest user key of the next file). These +// boundaries are propagated down to RangeDelAggregator during compaction +// to provide safe truncation boundaries for range tombstones. +struct AtomicCompactionUnitBoundary { + const InternalKey* smallest = nullptr; + const InternalKey* largest = nullptr; +}; + +// The structure that manages compaction input files associated +// with the same physical level. +struct CompactionInputFiles { + int level; + std::vector<FileMetaData*> files; + std::vector<AtomicCompactionUnitBoundary> atomic_compaction_unit_boundaries; + inline bool empty() const { return files.empty(); } + inline size_t size() const { return files.size(); } + inline void clear() { files.clear(); } + inline FileMetaData* operator[](size_t i) const { return files[i]; } +}; + +class Version; +class ColumnFamilyData; +class VersionStorageInfo; +class CompactionFilter; + +// A Compaction encapsulates metadata about a compaction. +class Compaction { + public: + Compaction(VersionStorageInfo* input_version, + const ImmutableOptions& immutable_options, + const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, + std::vector<CompactionInputFiles> inputs, int output_level, + uint64_t target_file_size, uint64_t max_compaction_bytes, + uint32_t output_path_id, CompressionType compression, + CompressionOptions compression_opts, + Temperature output_temperature, uint32_t max_subcompactions, + std::vector<FileMetaData*> grandparents, + bool manual_compaction = false, const std::string& trim_ts = "", + double score = -1, bool deletion_compaction = false, + bool l0_files_might_overlap = true, + CompactionReason compaction_reason = CompactionReason::kUnknown, + BlobGarbageCollectionPolicy blob_garbage_collection_policy = + BlobGarbageCollectionPolicy::kUseDefault, + double blob_garbage_collection_age_cutoff = -1); + + // The type of the penultimate level output range + enum class PenultimateOutputRangeType : int { + kNotSupported, // it cannot output to the penultimate level + kFullRange, // any data could be output to the penultimate level + kNonLastRange, // only the keys within non_last_level compaction inputs can + // be outputted to the penultimate level + kDisabled, // no data can be outputted to the penultimate level + }; + + // No copying allowed + Compaction(const Compaction&) = delete; + void operator=(const Compaction&) = delete; + + ~Compaction(); + + // Returns the level associated to the specified compaction input level. 
+ // If compaction_input_level is not specified, then input_level is set to 0. + int level(size_t compaction_input_level = 0) const { + return inputs_[compaction_input_level].level; + } + + int start_level() const { return start_level_; } + + // Outputs will go to this level + int output_level() const { return output_level_; } + + // Returns the number of input levels in this compaction. + size_t num_input_levels() const { return inputs_.size(); } + + // Return the object that holds the edits to the descriptor done + // by this compaction. + VersionEdit* edit() { return &edit_; } + + // Returns the number of input files associated to the specified + // compaction input level. + // The function will return 0 if when "compaction_input_level" < 0 + // or "compaction_input_level" >= "num_input_levels()". + size_t num_input_files(size_t compaction_input_level) const { + if (compaction_input_level < inputs_.size()) { + return inputs_[compaction_input_level].size(); + } + return 0; + } + + // Returns input version of the compaction + Version* input_version() const { return input_version_; } + + // Returns the ColumnFamilyData associated with the compaction. + ColumnFamilyData* column_family_data() const { return cfd_; } + + // Returns the file meta data of the 'i'th input file at the + // specified compaction input level. + // REQUIREMENT: "compaction_input_level" must be >= 0 and + // < "input_levels()" + FileMetaData* input(size_t compaction_input_level, size_t i) const { + assert(compaction_input_level < inputs_.size()); + return inputs_[compaction_input_level][i]; + } + + const std::vector<AtomicCompactionUnitBoundary>* boundaries( + size_t compaction_input_level) const { + assert(compaction_input_level < inputs_.size()); + return &inputs_[compaction_input_level].atomic_compaction_unit_boundaries; + } + + // Returns the list of file meta data of the specified compaction + // input level. + // REQUIREMENT: "compaction_input_level" must be >= 0 and + // < "input_levels()" + const std::vector<FileMetaData*>* inputs( + size_t compaction_input_level) const { + assert(compaction_input_level < inputs_.size()); + return &inputs_[compaction_input_level].files; + } + + const std::vector<CompactionInputFiles>* inputs() { return &inputs_; } + + // Returns the LevelFilesBrief of the specified compaction input level. + const LevelFilesBrief* input_levels(size_t compaction_input_level) const { + return &input_levels_[compaction_input_level]; + } + + // Maximum size of files to build during this compaction. + uint64_t max_output_file_size() const { return max_output_file_size_; } + + // Target output file size for this compaction + uint64_t target_output_file_size() const { return target_output_file_size_; } + + // What compression for output + CompressionType output_compression() const { return output_compression_; } + + // What compression options for output + const CompressionOptions& output_compression_opts() const { + return output_compression_opts_; + } + + // Whether need to write output file to second DB path. + uint32_t output_path_id() const { return output_path_id_; } + + // Is this a trivial compaction that can be implemented by just + // moving a single input file to the next level (no merging or splitting) + bool IsTrivialMove() const; + + // The split user key in the output level if this compaction is required to + // split the output files according to the existing cursor in the output + // level under round-robin compaction policy. 
Empty indicates no required + // splitting key + const InternalKey* GetOutputSplitKey() const { return output_split_key_; } + + // If true, then the compaction can be done by simply deleting input files. + bool deletion_compaction() const { return deletion_compaction_; } + + // Add all inputs to this compaction as delete operations to *edit. + void AddInputDeletions(VersionEdit* edit); + + // Returns true if the available information we have guarantees that + // the input "user_key" does not exist in any level beyond "output_level()". + bool KeyNotExistsBeyondOutputLevel(const Slice& user_key, + std::vector<size_t>* level_ptrs) const; + + // Clear all files to indicate that they are not being compacted + // Delete this compaction from the list of running compactions. + // + // Requirement: DB mutex held + void ReleaseCompactionFiles(Status status); + + // Returns the summary of the compaction in "output" with maximum "len" + // in bytes. The caller is responsible for the memory management of + // "output". + void Summary(char* output, int len); + + // Return the score that was used to pick this compaction run. + double score() const { return score_; } + + // Is this compaction creating a file in the bottom most level? + bool bottommost_level() const { return bottommost_level_; } + + // Is the compaction compact to the last level + bool is_last_level() const { + return output_level_ == immutable_options_.num_levels - 1; + } + + // Does this compaction include all sst files? + bool is_full_compaction() const { return is_full_compaction_; } + + // Was this compaction triggered manually by the client? + bool is_manual_compaction() const { return is_manual_compaction_; } + + std::string trim_ts() const { return trim_ts_; } + + // Used when allow_trivial_move option is set in + // Universal compaction. If all the input files are + // non overlapping, then is_trivial_move_ variable + // will be set true, else false + void set_is_trivial_move(bool trivial_move) { + is_trivial_move_ = trivial_move; + } + + // Used when allow_trivial_move option is set in + // Universal compaction. Returns true, if the input files + // are non-overlapping and can be trivially moved. + bool is_trivial_move() const { return is_trivial_move_; } + + // How many total levels are there? + int number_levels() const { return number_levels_; } + + // Return the ImmutableOptions that should be used throughout the compaction + // procedure + const ImmutableOptions* immutable_options() const { + return &immutable_options_; + } + + // Return the MutableCFOptions that should be used throughout the compaction + // procedure + const MutableCFOptions* mutable_cf_options() const { + return &mutable_cf_options_; + } + + // Returns the size in bytes that the output file should be preallocated to. + // In level compaction, that is max_file_size_. In universal compaction, that + // is the sum of all input file sizes. 
+ uint64_t OutputFilePreallocationSize() const; + + void SetInputVersion(Version* input_version); + + struct InputLevelSummaryBuffer { + char buffer[128]; + }; + + const char* InputLevelSummary(InputLevelSummaryBuffer* scratch) const; + + uint64_t CalculateTotalInputSize() const; + + // In case of compaction error, reset the nextIndex that is used + // to pick up the next file to be compacted from files_by_size_ + void ResetNextCompactionIndex(); + + // Create a CompactionFilter from compaction_filter_factory + std::unique_ptr<CompactionFilter> CreateCompactionFilter() const; + + // Create a SstPartitioner from sst_partitioner_factory + std::unique_ptr<SstPartitioner> CreateSstPartitioner() const; + + // Is the input level corresponding to output_level_ empty? + bool IsOutputLevelEmpty() const; + + // Should this compaction be broken up into smaller ones run in parallel? + bool ShouldFormSubcompactions() const; + + // Returns true iff at least one input file references a blob file. + // + // PRE: input version has been set. + bool DoesInputReferenceBlobFiles() const; + + // test function to validate the functionality of IsBottommostLevel() + // function -- determines if compaction with inputs and storage is bottommost + static bool TEST_IsBottommostLevel( + int output_level, VersionStorageInfo* vstorage, + const std::vector<CompactionInputFiles>& inputs); + + TablePropertiesCollection GetOutputTableProperties() const { + return output_table_properties_; + } + + void SetOutputTableProperties(TablePropertiesCollection tp) { + output_table_properties_ = std::move(tp); + } + + Slice GetSmallestUserKey() const { return smallest_user_key_; } + + Slice GetLargestUserKey() const { return largest_user_key_; } + + Slice GetPenultimateLevelSmallestUserKey() const { + return penultimate_level_smallest_user_key_; + } + + Slice GetPenultimateLevelLargestUserKey() const { + return penultimate_level_largest_user_key_; + } + + PenultimateOutputRangeType GetPenultimateOutputRangeType() const { + return penultimate_output_range_type_; + } + + // Return true if the compaction supports per_key_placement + bool SupportsPerKeyPlacement() const; + + // Get per_key_placement penultimate output level, which is `last_level - 1` + // if per_key_placement feature is supported. Otherwise, return -1. + int GetPenultimateLevel() const; + + // Return true if the given range is overlap with penultimate level output + // range. + // Both smallest_key and largest_key include timestamps if user-defined + // timestamp is enabled. + bool OverlapPenultimateLevelOutputRange(const Slice& smallest_key, + const Slice& largest_key) const; + + // Return true if the key is within penultimate level output range for + // per_key_placement feature, which is safe to place the key to the + // penultimate level. different compaction strategy has different rules. + // If per_key_placement is not supported, always return false. + // TODO: currently it doesn't support moving data from the last level to the + // penultimate level + // key includes timestamp if user-defined timestamp is enabled. 
+ bool WithinPenultimateLevelOutputRange(const Slice& key) const; + + CompactionReason compaction_reason() const { return compaction_reason_; } + + const std::vector<FileMetaData*>& grandparents() const { + return grandparents_; + } + + uint64_t max_compaction_bytes() const { return max_compaction_bytes_; } + + Temperature output_temperature() const { return output_temperature_; } + + uint32_t max_subcompactions() const { return max_subcompactions_; } + + bool enable_blob_garbage_collection() const { + return enable_blob_garbage_collection_; + } + + double blob_garbage_collection_age_cutoff() const { + return blob_garbage_collection_age_cutoff_; + } + + // start and end are sub compact range. Null if no boundary. + // This is used to filter out some input files' ancester's time range. + uint64_t MinInputFileOldestAncesterTime(const InternalKey* start, + const InternalKey* end) const; + + // Called by DBImpl::NotifyOnCompactionCompleted to make sure number of + // compaction begin and compaction completion callbacks match. + void SetNotifyOnCompactionCompleted() { + notify_on_compaction_completion_ = true; + } + + bool ShouldNotifyOnCompactionCompleted() const { + return notify_on_compaction_completion_; + } + + static constexpr int kInvalidLevel = -1; + + // Evaluate penultimate output level. If the compaction supports + // per_key_placement feature, it returns the penultimate level number. + // Otherwise, it's set to kInvalidLevel (-1), which means + // output_to_penultimate_level is not supported. + // Note: even the penultimate level output is supported (PenultimateLevel != + // kInvalidLevel), some key range maybe unsafe to be outputted to the + // penultimate level. The safe key range is populated by + // `PopulatePenultimateLevelOutputRange()`. + // Which could potentially disable all penultimate level output. + static int EvaluatePenultimateLevel(const VersionStorageInfo* vstorage, + const ImmutableOptions& immutable_options, + const int start_level, + const int output_level); + + private: + // mark (or clear) all files that are being compacted + void MarkFilesBeingCompacted(bool mark_as_compacted); + + // get the smallest and largest key present in files to be compacted + static void GetBoundaryKeys(VersionStorageInfo* vstorage, + const std::vector<CompactionInputFiles>& inputs, + Slice* smallest_key, Slice* largest_key, + int exclude_level = -1); + + // populate penultimate level output range, which will be used to determine if + // a key is safe to output to the penultimate level (details see + // `Compaction::WithinPenultimateLevelOutputRange()`. + void PopulatePenultimateLevelOutputRange(); + + // Get the atomic file boundaries for all files in the compaction. Necessary + // in order to avoid the scenario described in + // https://github.com/facebook/rocksdb/pull/4432#discussion_r221072219 and + // plumb down appropriate key boundaries to RangeDelAggregator during + // compaction. 
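+  // Roughly speaking, neighboring input files whose key ranges meet at the
+  // same user key are treated as one atomic unit, so the boundaries handed to
+  // RangeDelAggregator cover the whole unit rather than a single file.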
+ static std::vector<CompactionInputFiles> PopulateWithAtomicBoundaries( + VersionStorageInfo* vstorage, std::vector<CompactionInputFiles> inputs); + + // helper function to determine if compaction with inputs and storage is + // bottommost + static bool IsBottommostLevel( + int output_level, VersionStorageInfo* vstorage, + const std::vector<CompactionInputFiles>& inputs); + + static bool IsFullCompaction(VersionStorageInfo* vstorage, + const std::vector<CompactionInputFiles>& inputs); + + VersionStorageInfo* input_vstorage_; + + const int start_level_; // the lowest level to be compacted + const int output_level_; // levels to which output files are stored + uint64_t target_output_file_size_; + uint64_t max_output_file_size_; + uint64_t max_compaction_bytes_; + uint32_t max_subcompactions_; + const ImmutableOptions immutable_options_; + const MutableCFOptions mutable_cf_options_; + Version* input_version_; + VersionEdit edit_; + const int number_levels_; + ColumnFamilyData* cfd_; + Arena arena_; // Arena used to allocate space for file_levels_ + + const uint32_t output_path_id_; + CompressionType output_compression_; + CompressionOptions output_compression_opts_; + Temperature output_temperature_; + // If true, then the compaction can be done by simply deleting input files. + const bool deletion_compaction_; + // should it split the output file using the compact cursor? + const InternalKey* output_split_key_; + + // L0 files in LSM-tree might be overlapping. But the compaction picking + // logic might pick a subset of the files that aren't overlapping. if + // that is the case, set the value to false. Otherwise, set it true. + bool l0_files_might_overlap_; + + // Compaction input files organized by level. Constant after construction + const std::vector<CompactionInputFiles> inputs_; + + // A copy of inputs_, organized more closely in memory + autovector<LevelFilesBrief, 2> input_levels_; + + // State used to check for number of overlapping grandparent files + // (grandparent == "output_level_ + 1") + std::vector<FileMetaData*> grandparents_; + const double score_; // score that was used to pick this compaction. + + // Is this compaction creating a file in the bottom most level? + const bool bottommost_level_; + // Does this compaction include all sst files? + const bool is_full_compaction_; + + // Is this compaction requested by the client? + const bool is_manual_compaction_; + + // The data with timestamp > trim_ts_ will be removed + const std::string trim_ts_; + + // True if we can do trivial move in Universal multi level + // compaction + bool is_trivial_move_; + + // Does input compression match the output compression? + bool InputCompressionMatchesOutput() const; + + // table properties of output files + TablePropertiesCollection output_table_properties_; + + // smallest user keys in compaction + // includes timestamp if user-defined timestamp is enabled. + Slice smallest_user_key_; + + // largest user keys in compaction + // includes timestamp if user-defined timestamp is enabled. + Slice largest_user_key_; + + // Reason for compaction + CompactionReason compaction_reason_; + + // Notify on compaction completion only if listener was notified on compaction + // begin. + bool notify_on_compaction_completion_; + + // Enable/disable GC collection for blobs during compaction. + bool enable_blob_garbage_collection_; + + // Blob garbage collection age cutoff. 
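+  // A fraction in [0, 1]; e.g. a cutoff of 0.25 makes blobs in the oldest
+  // quarter of blob files eligible for relocation by this compaction (see
+  // ComputeBlobGarbageCollectionCutoffFileNumber in compaction_iterator.cc).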
+ double blob_garbage_collection_age_cutoff_; + + // only set when per_key_placement feature is enabled, -1 (kInvalidLevel) + // means not supported. + const int penultimate_level_; + + // Key range for penultimate level output + // includes timestamp if user-defined timestamp is enabled. + // penultimate_output_range_type_ shows the range type + Slice penultimate_level_smallest_user_key_; + Slice penultimate_level_largest_user_key_; + PenultimateOutputRangeType penultimate_output_range_type_ = + PenultimateOutputRangeType::kNotSupported; +}; + +#ifndef NDEBUG +// Helper struct only for tests, which contains the data to decide if a key +// should be output to the penultimate level. +// TODO: remove this when the public feature knob is available +struct PerKeyPlacementContext { + const int level; + const Slice key; + const Slice value; + const SequenceNumber seq_num; + + bool output_to_penultimate_level; + + PerKeyPlacementContext(int _level, Slice _key, Slice _value, + SequenceNumber _seq_num) + : level(_level), key(_key), value(_value), seq_num(_seq_num) { + output_to_penultimate_level = false; + } +}; +#endif /* !NDEBUG */ + +// Return sum of sizes of all files in `files`. +extern uint64_t TotalFileSize(const std::vector<FileMetaData*>& files); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/compaction/compaction_iteration_stats.h b/src/rocksdb/db/compaction/compaction_iteration_stats.h new file mode 100644 index 000000000..1b1c28b57 --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_iteration_stats.h @@ -0,0 +1,49 @@ +// Copyright (c) 2016-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <cstdint> + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +struct CompactionIterationStats { + // Compaction statistics + + // Doesn't include records skipped because of + // CompactionFilter::Decision::kRemoveAndSkipUntil. + int64_t num_record_drop_user = 0; + + int64_t num_record_drop_hidden = 0; + int64_t num_record_drop_obsolete = 0; + int64_t num_record_drop_range_del = 0; + int64_t num_range_del_drop_obsolete = 0; + // Deletions obsoleted before bottom level due to file gap optimization. + int64_t num_optimized_del_drop_obsolete = 0; + uint64_t total_filter_time = 0; + + // Input statistics + // TODO(noetzli): The stats are incomplete. They are lacking everything + // consumed by MergeHelper. + uint64_t num_input_records = 0; + uint64_t num_input_deletion_records = 0; + uint64_t num_input_corrupt_records = 0; + uint64_t total_input_raw_key_bytes = 0; + uint64_t total_input_raw_value_bytes = 0; + + // Single-Delete diagnostics for exceptional situations + uint64_t num_single_del_fallthru = 0; + uint64_t num_single_del_mismatch = 0; + + // Blob related statistics + uint64_t num_blobs_read = 0; + uint64_t total_blob_bytes_read = 0; + uint64_t num_blobs_relocated = 0; + uint64_t total_blob_bytes_relocated = 0; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/compaction/compaction_iterator.cc b/src/rocksdb/db/compaction/compaction_iterator.cc new file mode 100644 index 000000000..9f54f7813 --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_iterator.cc @@ -0,0 +1,1338 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/compaction/compaction_iterator.h" + +#include <iterator> +#include <limits> + +#include "db/blob/blob_fetcher.h" +#include "db/blob/blob_file_builder.h" +#include "db/blob/blob_index.h" +#include "db/blob/prefetch_buffer_collection.h" +#include "db/snapshot_checker.h" +#include "logging/logging.h" +#include "port/likely.h" +#include "rocksdb/listener.h" +#include "table/internal_iterator.h" +#include "test_util/sync_point.h" + +namespace ROCKSDB_NAMESPACE { +CompactionIterator::CompactionIterator( + InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper, + SequenceNumber last_sequence, std::vector<SequenceNumber>* snapshots, + SequenceNumber earliest_write_conflict_snapshot, + SequenceNumber job_snapshot, const SnapshotChecker* snapshot_checker, + Env* env, bool report_detailed_time, bool expect_valid_internal_key, + CompactionRangeDelAggregator* range_del_agg, + BlobFileBuilder* blob_file_builder, bool allow_data_in_errors, + bool enforce_single_del_contracts, + const std::atomic<bool>& manual_compaction_canceled, + const Compaction* compaction, const CompactionFilter* compaction_filter, + const std::atomic<bool>* shutting_down, + const std::shared_ptr<Logger> info_log, + const std::string* full_history_ts_low, + const SequenceNumber preserve_time_min_seqno, + const SequenceNumber preclude_last_level_min_seqno) + : CompactionIterator( + input, cmp, merge_helper, last_sequence, snapshots, + earliest_write_conflict_snapshot, job_snapshot, snapshot_checker, env, + report_detailed_time, expect_valid_internal_key, range_del_agg, + blob_file_builder, allow_data_in_errors, enforce_single_del_contracts, + manual_compaction_canceled, + std::unique_ptr<CompactionProxy>( + compaction ? 
new RealCompaction(compaction) : nullptr), + compaction_filter, shutting_down, info_log, full_history_ts_low, + preserve_time_min_seqno, preclude_last_level_min_seqno) {} + +CompactionIterator::CompactionIterator( + InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper, + SequenceNumber /*last_sequence*/, std::vector<SequenceNumber>* snapshots, + SequenceNumber earliest_write_conflict_snapshot, + SequenceNumber job_snapshot, const SnapshotChecker* snapshot_checker, + Env* env, bool report_detailed_time, bool expect_valid_internal_key, + CompactionRangeDelAggregator* range_del_agg, + BlobFileBuilder* blob_file_builder, bool allow_data_in_errors, + bool enforce_single_del_contracts, + const std::atomic<bool>& manual_compaction_canceled, + std::unique_ptr<CompactionProxy> compaction, + const CompactionFilter* compaction_filter, + const std::atomic<bool>* shutting_down, + const std::shared_ptr<Logger> info_log, + const std::string* full_history_ts_low, + const SequenceNumber preserve_time_min_seqno, + const SequenceNumber preclude_last_level_min_seqno) + : input_(input, cmp, + !compaction || compaction->DoesInputReferenceBlobFiles()), + cmp_(cmp), + merge_helper_(merge_helper), + snapshots_(snapshots), + earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot), + job_snapshot_(job_snapshot), + snapshot_checker_(snapshot_checker), + env_(env), + clock_(env_->GetSystemClock().get()), + report_detailed_time_(report_detailed_time), + expect_valid_internal_key_(expect_valid_internal_key), + range_del_agg_(range_del_agg), + blob_file_builder_(blob_file_builder), + compaction_(std::move(compaction)), + compaction_filter_(compaction_filter), + shutting_down_(shutting_down), + manual_compaction_canceled_(manual_compaction_canceled), + bottommost_level_(!compaction_ ? false + : compaction_->bottommost_level() && + !compaction_->allow_ingest_behind()), + // snapshots_ cannot be nullptr, but we will assert later in the body of + // the constructor. + visible_at_tip_(snapshots_ ? snapshots_->empty() : false), + earliest_snapshot_(!snapshots_ || snapshots_->empty() + ? kMaxSequenceNumber + : snapshots_->at(0)), + info_log_(info_log), + allow_data_in_errors_(allow_data_in_errors), + enforce_single_del_contracts_(enforce_single_del_contracts), + timestamp_size_(cmp_ ? cmp_->timestamp_size() : 0), + full_history_ts_low_(full_history_ts_low), + current_user_key_sequence_(0), + current_user_key_snapshot_(0), + merge_out_iter_(merge_helper_), + blob_garbage_collection_cutoff_file_number_( + ComputeBlobGarbageCollectionCutoffFileNumber(compaction_.get())), + blob_fetcher_(CreateBlobFetcherIfNeeded(compaction_.get())), + prefetch_buffers_( + CreatePrefetchBufferCollectionIfNeeded(compaction_.get())), + current_key_committed_(false), + cmp_with_history_ts_low_(0), + level_(compaction_ == nullptr ? 0 : compaction_->level()), + preserve_time_min_seqno_(preserve_time_min_seqno), + preclude_last_level_min_seqno_(preclude_last_level_min_seqno) { + assert(snapshots_ != nullptr); + assert(preserve_time_min_seqno_ <= preclude_last_level_min_seqno_); + + if (compaction_ != nullptr) { + level_ptrs_ = std::vector<size_t>(compaction_->number_levels(), 0); + } +#ifndef NDEBUG + // findEarliestVisibleSnapshot assumes this ordering. 
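+  // That is, snapshots_ must be sorted in strictly increasing order, which
+  // the debug-only checks below verify.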
+ for (size_t i = 1; i < snapshots_->size(); ++i) { + assert(snapshots_->at(i - 1) < snapshots_->at(i)); + } + assert(timestamp_size_ == 0 || !full_history_ts_low_ || + timestamp_size_ == full_history_ts_low_->size()); +#endif + input_.SetPinnedItersMgr(&pinned_iters_mgr_); + TEST_SYNC_POINT_CALLBACK("CompactionIterator:AfterInit", compaction_.get()); +} + +CompactionIterator::~CompactionIterator() { + // input_ Iterator lifetime is longer than pinned_iters_mgr_ lifetime + input_.SetPinnedItersMgr(nullptr); +} + +void CompactionIterator::ResetRecordCounts() { + iter_stats_.num_record_drop_user = 0; + iter_stats_.num_record_drop_hidden = 0; + iter_stats_.num_record_drop_obsolete = 0; + iter_stats_.num_record_drop_range_del = 0; + iter_stats_.num_range_del_drop_obsolete = 0; + iter_stats_.num_optimized_del_drop_obsolete = 0; +} + +void CompactionIterator::SeekToFirst() { + NextFromInput(); + PrepareOutput(); +} + +void CompactionIterator::Next() { + // If there is a merge output, return it before continuing to process the + // input. + if (merge_out_iter_.Valid()) { + merge_out_iter_.Next(); + + // Check if we returned all records of the merge output. + if (merge_out_iter_.Valid()) { + key_ = merge_out_iter_.key(); + value_ = merge_out_iter_.value(); + Status s = ParseInternalKey(key_, &ikey_, allow_data_in_errors_); + // MergeUntil stops when it encounters a corrupt key and does not + // include them in the result, so we expect the keys here to be valid. + if (!s.ok()) { + ROCKS_LOG_FATAL( + info_log_, "Invalid ikey %s in compaction. %s", + allow_data_in_errors_ ? key_.ToString(true).c_str() : "hidden", + s.getState()); + assert(false); + } + + // Keep current_key_ in sync. + if (0 == timestamp_size_) { + current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); + } else { + Slice ts = ikey_.GetTimestamp(timestamp_size_); + current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type, &ts); + } + key_ = current_key_.GetInternalKey(); + ikey_.user_key = current_key_.GetUserKey(); + validity_info_.SetValid(ValidContext::kMerge1); + } else { + // We consumed all pinned merge operands, release pinned iterators + pinned_iters_mgr_.ReleasePinnedData(); + // MergeHelper moves the iterator to the first record after the merged + // records, so even though we reached the end of the merge output, we do + // not want to advance the iterator. + NextFromInput(); + } + } else { + // Only advance the input iterator if there is no merge output and the + // iterator is not already at the next record. + if (!at_next_) { + AdvanceInputIter(); + } + NextFromInput(); + } + + if (Valid()) { + // Record that we've outputted a record for the current key. + has_outputted_key_ = true; + } + + PrepareOutput(); +} + +bool CompactionIterator::InvokeFilterIfNeeded(bool* need_skip, + Slice* skip_until) { + // TODO: support compaction filter for wide-column entities + if (!compaction_filter_ || + (ikey_.type != kTypeValue && ikey_.type != kTypeBlobIndex)) { + return true; + } + bool error = false; + // If the user has specified a compaction filter and the sequence + // number is greater than any external snapshot, then invoke the + // filter. If the return value of the compaction filter is true, + // replace the entry with a deletion marker. + CompactionFilter::Decision filter = CompactionFilter::Decision::kUndetermined; + compaction_filter_value_.clear(); + compaction_filter_skip_until_.Clear(); + CompactionFilter::ValueType value_type = + ikey_.type == kTypeValue ? 
CompactionFilter::ValueType::kValue + : CompactionFilter::ValueType::kBlobIndex; + // Hack: pass internal key to BlobIndexCompactionFilter since it needs + // to get sequence number. + assert(compaction_filter_); + Slice& filter_key = + (ikey_.type == kTypeValue || + !compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) + ? ikey_.user_key + : key_; + { + StopWatchNano timer(clock_, report_detailed_time_); + if (kTypeBlobIndex == ikey_.type) { + filter = compaction_filter_->FilterBlobByKey( + level_, filter_key, &compaction_filter_value_, + compaction_filter_skip_until_.rep()); + if (CompactionFilter::Decision::kUndetermined == filter && + !compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) { + if (compaction_ == nullptr) { + status_ = + Status::Corruption("Unexpected blob index outside of compaction"); + validity_info_.Invalidate(); + return false; + } + + TEST_SYNC_POINT_CALLBACK( + "CompactionIterator::InvokeFilterIfNeeded::TamperWithBlobIndex", + &value_); + + // For integrated BlobDB impl, CompactionIterator reads blob value. + // For Stacked BlobDB impl, the corresponding CompactionFilter's + // FilterV2 method should read the blob value. + BlobIndex blob_index; + Status s = blob_index.DecodeFrom(value_); + if (!s.ok()) { + status_ = s; + validity_info_.Invalidate(); + return false; + } + + FilePrefetchBuffer* prefetch_buffer = + prefetch_buffers_ ? prefetch_buffers_->GetOrCreatePrefetchBuffer( + blob_index.file_number()) + : nullptr; + + uint64_t bytes_read = 0; + + assert(blob_fetcher_); + + s = blob_fetcher_->FetchBlob(ikey_.user_key, blob_index, + prefetch_buffer, &blob_value_, + &bytes_read); + if (!s.ok()) { + status_ = s; + validity_info_.Invalidate(); + return false; + } + + ++iter_stats_.num_blobs_read; + iter_stats_.total_blob_bytes_read += bytes_read; + + value_type = CompactionFilter::ValueType::kValue; + } + } + if (CompactionFilter::Decision::kUndetermined == filter) { + filter = compaction_filter_->FilterV2( + level_, filter_key, value_type, + blob_value_.empty() ? value_ : blob_value_, &compaction_filter_value_, + compaction_filter_skip_until_.rep()); + } + iter_stats_.total_filter_time += + env_ != nullptr && report_detailed_time_ ? timer.ElapsedNanos() : 0; + } + + if (CompactionFilter::Decision::kUndetermined == filter) { + // Should not reach here, since FilterV2 should never return kUndetermined. + status_ = + Status::NotSupported("FilterV2() should never return kUndetermined"); + validity_info_.Invalidate(); + return false; + } + + if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil && + cmp_->Compare(*compaction_filter_skip_until_.rep(), ikey_.user_key) <= + 0) { + // Can't skip to a key smaller than the current one. + // Keep the key as per FilterV2 documentation. 
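+    // For example, if the current user key is "k5" and the filter asks to
+    // skip until "k3", the decision is downgraded to kKeep so the iterator
+    // never needs to move backwards.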
+ filter = CompactionFilter::Decision::kKeep; + } + + if (filter == CompactionFilter::Decision::kRemove) { + // convert the current key to a delete; key_ is pointing into + // current_key_ at this point, so updating current_key_ updates key() + ikey_.type = kTypeDeletion; + current_key_.UpdateInternalKey(ikey_.sequence, kTypeDeletion); + // no value associated with delete + value_.clear(); + iter_stats_.num_record_drop_user++; + } else if (filter == CompactionFilter::Decision::kPurge) { + // convert the current key to a single delete; key_ is pointing into + // current_key_ at this point, so updating current_key_ updates key() + ikey_.type = kTypeSingleDeletion; + current_key_.UpdateInternalKey(ikey_.sequence, kTypeSingleDeletion); + // no value associated with single delete + value_.clear(); + iter_stats_.num_record_drop_user++; + } else if (filter == CompactionFilter::Decision::kChangeValue) { + if (ikey_.type == kTypeBlobIndex) { + // value transfer from blob file to inlined data + ikey_.type = kTypeValue; + current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); + } + value_ = compaction_filter_value_; + } else if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil) { + *need_skip = true; + compaction_filter_skip_until_.ConvertFromUserKey(kMaxSequenceNumber, + kValueTypeForSeek); + *skip_until = compaction_filter_skip_until_.Encode(); + } else if (filter == CompactionFilter::Decision::kChangeBlobIndex) { + // Only the StackableDB-based BlobDB impl's compaction filter should return + // kChangeBlobIndex. Decision about rewriting blob and changing blob index + // in the integrated BlobDB impl is made in subsequent call to + // PrepareOutput() and its callees. + if (!compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) { + status_ = Status::NotSupported( + "Only stacked BlobDB's internal compaction filter can return " + "kChangeBlobIndex."); + validity_info_.Invalidate(); + return false; + } + if (ikey_.type == kTypeValue) { + // value transfer from inlined data to blob file + ikey_.type = kTypeBlobIndex; + current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); + } + value_ = compaction_filter_value_; + } else if (filter == CompactionFilter::Decision::kIOError) { + if (!compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) { + status_ = Status::NotSupported( + "CompactionFilter for integrated BlobDB should not return kIOError"); + validity_info_.Invalidate(); + return false; + } + status_ = Status::IOError("Failed to access blob during compaction filter"); + error = true; + } + return !error; +} + +void CompactionIterator::NextFromInput() { + at_next_ = false; + validity_info_.Invalidate(); + + while (!Valid() && input_.Valid() && !IsPausingManualCompaction() && + !IsShuttingDown()) { + key_ = input_.key(); + value_ = input_.value(); + blob_value_.Reset(); + iter_stats_.num_input_records++; + + Status pik_status = ParseInternalKey(key_, &ikey_, allow_data_in_errors_); + if (!pik_status.ok()) { + iter_stats_.num_input_corrupt_records++; + + // If `expect_valid_internal_key_` is false, return the corrupted key + // and let the caller decide what to do with it. 
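+      // (When `expect_valid_internal_key_` is true, the parse failure is
+      // instead treated as a hard error and surfaced through status().)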
+ if (expect_valid_internal_key_) { + status_ = pik_status; + return; + } + key_ = current_key_.SetInternalKey(key_); + has_current_user_key_ = false; + current_user_key_sequence_ = kMaxSequenceNumber; + current_user_key_snapshot_ = 0; + validity_info_.SetValid(ValidContext::kParseKeyError); + break; + } + TEST_SYNC_POINT_CALLBACK("CompactionIterator:ProcessKV", &ikey_); + + // Update input statistics + if (ikey_.type == kTypeDeletion || ikey_.type == kTypeSingleDeletion || + ikey_.type == kTypeDeletionWithTimestamp) { + iter_stats_.num_input_deletion_records++; + } + iter_stats_.total_input_raw_key_bytes += key_.size(); + iter_stats_.total_input_raw_value_bytes += value_.size(); + + // If need_skip is true, we should seek the input iterator + // to internal key skip_until and continue from there. + bool need_skip = false; + // Points either into compaction_filter_skip_until_ or into + // merge_helper_->compaction_filter_skip_until_. + Slice skip_until; + + bool user_key_equal_without_ts = false; + int cmp_ts = 0; + if (has_current_user_key_) { + user_key_equal_without_ts = + cmp_->EqualWithoutTimestamp(ikey_.user_key, current_user_key_); + // if timestamp_size_ > 0, then curr_ts_ has been initialized by a + // previous key. + cmp_ts = timestamp_size_ ? cmp_->CompareTimestamp( + ExtractTimestampFromUserKey( + ikey_.user_key, timestamp_size_), + curr_ts_) + : 0; + } + + // Check whether the user key changed. After this if statement current_key_ + // is a copy of the current input key (maybe converted to a delete by the + // compaction filter). ikey_.user_key is pointing to the copy. + if (!has_current_user_key_ || !user_key_equal_without_ts || cmp_ts != 0) { + // First occurrence of this user key + // Copy key for output + key_ = current_key_.SetInternalKey(key_, &ikey_); + + int prev_cmp_with_ts_low = + !full_history_ts_low_ ? 0 + : curr_ts_.empty() + ? 0 + : cmp_->CompareTimestamp(curr_ts_, *full_history_ts_low_); + + // If timestamp_size_ > 0, then copy from ikey_ to curr_ts_ for the use + // in next iteration to compare with the timestamp of next key. + UpdateTimestampAndCompareWithFullHistoryLow(); + + // If + // (1) !has_current_user_key_, OR + // (2) timestamp is disabled, OR + // (3) all history will be preserved, OR + // (4) user key (excluding timestamp) is different from previous key, OR + // (5) timestamp is NO older than *full_history_ts_low_, OR + // (6) timestamp is the largest one older than full_history_ts_low_, + // then current_user_key_ must be treated as a different user key. + // This means, if a user key (excluding ts) is the same as the previous + // user key, and its ts is older than *full_history_ts_low_, then we + // consider this key for GC, e.g. it may be dropped if certain conditions + // match. + if (!has_current_user_key_ || !timestamp_size_ || !full_history_ts_low_ || + !user_key_equal_without_ts || cmp_with_history_ts_low_ >= 0 || + prev_cmp_with_ts_low >= 0) { + // Initialize for future comparison for rule (A) and etc. + current_user_key_sequence_ = kMaxSequenceNumber; + current_user_key_snapshot_ = 0; + has_current_user_key_ = true; + } + current_user_key_ = ikey_.user_key; + + has_outputted_key_ = false; + + last_key_seq_zeroed_ = false; + + current_key_committed_ = KeyCommitted(ikey_.sequence); + + // Apply the compaction filter to the first committed version of the user + // key. 
+ if (current_key_committed_ && + !InvokeFilterIfNeeded(&need_skip, &skip_until)) { + break; + } + } else { + // Update the current key to reflect the new sequence number/type without + // copying the user key. + // TODO(rven): Compaction filter does not process keys in this path + // Need to have the compaction filter process multiple versions + // if we have versions on both sides of a snapshot + current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); + key_ = current_key_.GetInternalKey(); + ikey_.user_key = current_key_.GetUserKey(); + + // Note that newer version of a key is ordered before older versions. If a + // newer version of a key is committed, so as the older version. No need + // to query snapshot_checker_ in that case. + if (UNLIKELY(!current_key_committed_)) { + assert(snapshot_checker_ != nullptr); + current_key_committed_ = KeyCommitted(ikey_.sequence); + // Apply the compaction filter to the first committed version of the + // user key. + if (current_key_committed_ && + !InvokeFilterIfNeeded(&need_skip, &skip_until)) { + break; + } + } + } + + if (UNLIKELY(!current_key_committed_)) { + assert(snapshot_checker_ != nullptr); + validity_info_.SetValid(ValidContext::kCurrentKeyUncommitted); + break; + } + + // If there are no snapshots, then this kv affect visibility at tip. + // Otherwise, search though all existing snapshots to find the earliest + // snapshot that is affected by this kv. + SequenceNumber last_sequence = current_user_key_sequence_; + current_user_key_sequence_ = ikey_.sequence; + SequenceNumber last_snapshot = current_user_key_snapshot_; + SequenceNumber prev_snapshot = 0; // 0 means no previous snapshot + current_user_key_snapshot_ = + visible_at_tip_ + ? earliest_snapshot_ + : findEarliestVisibleSnapshot(ikey_.sequence, &prev_snapshot); + + if (need_skip) { + // This case is handled below. + } else if (clear_and_output_next_key_) { + // In the previous iteration we encountered a single delete that we could + // not compact out. We will keep this Put, but can drop it's data. + // (See Optimization 3, below.) + if (ikey_.type != kTypeValue && ikey_.type != kTypeBlobIndex && + ikey_.type != kTypeWideColumnEntity) { + ROCKS_LOG_FATAL(info_log_, "Unexpected key %s for compaction output", + ikey_.DebugString(allow_data_in_errors_, true).c_str()); + assert(false); + } + if (current_user_key_snapshot_ < last_snapshot) { + ROCKS_LOG_FATAL(info_log_, + "key %s, current_user_key_snapshot_ (%" PRIu64 + ") < last_snapshot (%" PRIu64 ")", + ikey_.DebugString(allow_data_in_errors_, true).c_str(), + current_user_key_snapshot_, last_snapshot); + assert(false); + } + + if (ikey_.type == kTypeBlobIndex || ikey_.type == kTypeWideColumnEntity) { + ikey_.type = kTypeValue; + current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); + } + + value_.clear(); + validity_info_.SetValid(ValidContext::kKeepSDAndClearPut); + clear_and_output_next_key_ = false; + } else if (ikey_.type == kTypeSingleDeletion) { + // We can compact out a SingleDelete if: + // 1) We encounter the corresponding PUT -OR- we know that this key + // doesn't appear past this output level + // =AND= + // 2) We've already returned a record in this snapshot -OR- + // there are no earlier earliest_write_conflict_snapshot. + // + // A note about 2) above: + // we try to determine whether there is any earlier write conflict + // checking snapshot by calling DefinitelyInSnapshot() with seq and + // earliest_write_conflict_snapshot as arguments. 
For write-prepared + // and write-unprepared transactions, if earliest_write_conflict_snapshot + // is evicted from WritePreparedTxnDB::commit_cache, then + // DefinitelyInSnapshot(seq, earliest_write_conflict_snapshot) returns + // false, even if the seq is actually visible within + // earliest_write_conflict_snapshot. Consequently, CompactionIterator + // may try to zero out its sequence number, thus hitting assertion error + // in debug mode or cause incorrect DBIter return result. + // We observe that earliest_write_conflict_snapshot >= earliest_snapshot, + // and the seq zeroing logic depends on + // DefinitelyInSnapshot(seq, earliest_snapshot). Therefore, if we cannot + // determine whether seq is **definitely** in + // earliest_write_conflict_snapshot, then we can additionally check if + // seq is definitely in earliest_snapshot. If the latter holds, then the + // former holds too. + // + // Rule 1 is needed for SingleDelete correctness. Rule 2 is needed to + // allow Transactions to do write-conflict checking (if we compacted away + // all keys, then we wouldn't know that a write happened in this + // snapshot). If there is no earlier snapshot, then we know that there + // are no active transactions that need to know about any writes. + // + // Optimization 3: + // If we encounter a SingleDelete followed by a PUT and Rule 2 is NOT + // true, then we must output a SingleDelete. In this case, we will decide + // to also output the PUT. While we are compacting less by outputting the + // PUT now, hopefully this will lead to better compaction in the future + // when Rule 2 is later true (Ie, We are hoping we can later compact out + // both the SingleDelete and the Put, while we couldn't if we only + // outputted the SingleDelete now). + // In this case, we can save space by removing the PUT's value as it will + // never be read. + // + // Deletes and Merges are not supported on the same key that has a + // SingleDelete as it is not possible to correctly do any partial + // compaction of such a combination of operations. The result of mixing + // those operations for a given key is documented as being undefined. So + // we can choose how to handle such a combinations of operations. We will + // try to compact out as much as we can in these cases. + // We will report counts on these anomalous cases. + // + // Note: If timestamp is enabled, then record will be eligible for + // deletion, only if, along with above conditions (Rule 1 and Rule 2) + // full_history_ts_low_ is specified and timestamp for that key is less + // than *full_history_ts_low_. If it's not eligible for deletion, then we + // will output the SingleDelete. For Optimization 3 also, if + // full_history_ts_low_ is specified and timestamp for the key is less + // than *full_history_ts_low_ then only optimization will be applied. + + // The easiest way to process a SingleDelete during iteration is to peek + // ahead at the next key. + const bool is_timestamp_eligible_for_gc = + (timestamp_size_ == 0 || + (full_history_ts_low_ && cmp_with_history_ts_low_ < 0)); + + ParsedInternalKey next_ikey; + AdvanceInputIter(); + + // Check whether the next key exists, is not corrupt, and is the same key + // as the single delete. + if (input_.Valid() && + ParseInternalKey(input_.key(), &next_ikey, allow_data_in_errors_) + .ok() && + cmp_->EqualWithoutTimestamp(ikey_.user_key, next_ikey.user_key)) { +#ifndef NDEBUG + const Compaction* c = + compaction_ ? 
compaction_->real_compaction() : nullptr; +#endif + TEST_SYNC_POINT_CALLBACK( + "CompactionIterator::NextFromInput:SingleDelete:1", + const_cast<Compaction*>(c)); + if (last_key_seq_zeroed_) { + ++iter_stats_.num_record_drop_hidden; + ++iter_stats_.num_record_drop_obsolete; + assert(bottommost_level_); + AdvanceInputIter(); + } else if (prev_snapshot == 0 || + DefinitelyNotInSnapshot(next_ikey.sequence, prev_snapshot)) { + // Check whether the next key belongs to the same snapshot as the + // SingleDelete. + + TEST_SYNC_POINT_CALLBACK( + "CompactionIterator::NextFromInput:SingleDelete:2", nullptr); + if (next_ikey.type == kTypeSingleDeletion) { + // We encountered two SingleDeletes for same key in a row. This + // could be due to unexpected user input. If write-(un)prepared + // transaction is used, this could also be due to releasing an old + // snapshot between a Put and its matching SingleDelete. + // Skip the first SingleDelete and let the next iteration decide + // how to handle the second SingleDelete. + + // First SingleDelete has been skipped since we already called + // input_.Next(). + ++iter_stats_.num_record_drop_obsolete; + ++iter_stats_.num_single_del_mismatch; + } else if (next_ikey.type == kTypeDeletion) { + std::ostringstream oss; + oss << "Found SD and type: " << static_cast<int>(next_ikey.type) + << " on the same key, violating the contract " + "of SingleDelete. Check your application to make sure the " + "application does not mix SingleDelete and Delete for " + "the same key. If you are using " + "write-prepared/write-unprepared transactions, and use " + "SingleDelete to delete certain keys, then make sure " + "TransactionDBOptions::rollback_deletion_type_callback is " + "configured properly. Mixing SD and DEL can lead to " + "undefined behaviors"; + ++iter_stats_.num_record_drop_obsolete; + ++iter_stats_.num_single_del_mismatch; + if (enforce_single_del_contracts_) { + ROCKS_LOG_ERROR(info_log_, "%s", oss.str().c_str()); + validity_info_.Invalidate(); + status_ = Status::Corruption(oss.str()); + return; + } + ROCKS_LOG_WARN(info_log_, "%s", oss.str().c_str()); + } else if (!is_timestamp_eligible_for_gc) { + // We cannot drop the SingleDelete as timestamp is enabled, and + // timestamp of this key is greater than or equal to + // *full_history_ts_low_. We will output the SingleDelete. + validity_info_.SetValid(ValidContext::kKeepTsHistory); + } else if (has_outputted_key_ || + DefinitelyInSnapshot(ikey_.sequence, + earliest_write_conflict_snapshot_) || + (earliest_snapshot_ < earliest_write_conflict_snapshot_ && + DefinitelyInSnapshot(ikey_.sequence, + earliest_snapshot_))) { + // Found a matching value, we can drop the single delete and the + // value. It is safe to drop both records since we've already + // outputted a key in this snapshot, or there is no earlier + // snapshot (Rule 2 above). + + // Note: it doesn't matter whether the second key is a Put or if it + // is an unexpected Merge or Delete. We will compact it out + // either way. We will maintain counts of how many mismatches + // happened + if (next_ikey.type != kTypeValue && + next_ikey.type != kTypeBlobIndex && + next_ikey.type != kTypeWideColumnEntity) { + ++iter_stats_.num_single_del_mismatch; + } + + ++iter_stats_.num_record_drop_hidden; + ++iter_stats_.num_record_drop_obsolete; + // Already called input_.Next() once. Call it a second time to + // skip past the second key. 
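+            // For example, an SD directly followed by the PUT it covers in
+            // the same snapshot stripe, with Rule 2 above satisfied, results
+            // in both records being dropped here.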
+ AdvanceInputIter(); + } else { + // Found a matching value, but we cannot drop both keys since + // there is an earlier snapshot and we need to leave behind a record + // to know that a write happened in this snapshot (Rule 2 above). + // Clear the value and output the SingleDelete. (The value will be + // outputted on the next iteration.) + + // Setting valid_ to true will output the current SingleDelete + validity_info_.SetValid(ValidContext::kKeepSDForConflictCheck); + + // Set up the Put to be outputted in the next iteration. + // (Optimization 3). + clear_and_output_next_key_ = true; + TEST_SYNC_POINT_CALLBACK( + "CompactionIterator::NextFromInput:KeepSDForWW", + /*arg=*/nullptr); + } + } else { + // We hit the next snapshot without hitting a put, so the iterator + // returns the single delete. + validity_info_.SetValid(ValidContext::kKeepSDForSnapshot); + TEST_SYNC_POINT_CALLBACK( + "CompactionIterator::NextFromInput:SingleDelete:3", + const_cast<Compaction*>(c)); + } + } else { + // We are at the end of the input, could not parse the next key, or hit + // a different key. The iterator returns the single delete if the key + // possibly exists beyond the current output level. We set + // has_current_user_key to false so that if the iterator is at the next + // key, we do not compare it again against the previous key at the next + // iteration. If the next key is corrupt, we return before the + // comparison, so the value of has_current_user_key does not matter. + has_current_user_key_ = false; + if (compaction_ != nullptr && + DefinitelyInSnapshot(ikey_.sequence, earliest_snapshot_) && + compaction_->KeyNotExistsBeyondOutputLevel(ikey_.user_key, + &level_ptrs_) && + is_timestamp_eligible_for_gc) { + // Key doesn't exist outside of this range. + // Can compact out this SingleDelete. + ++iter_stats_.num_record_drop_obsolete; + ++iter_stats_.num_single_del_fallthru; + if (!bottommost_level_) { + ++iter_stats_.num_optimized_del_drop_obsolete; + } + } else if (last_key_seq_zeroed_) { + // Skip. + ++iter_stats_.num_record_drop_hidden; + ++iter_stats_.num_record_drop_obsolete; + assert(bottommost_level_); + } else { + // Output SingleDelete + validity_info_.SetValid(ValidContext::kKeepSD); + } + } + + if (Valid()) { + at_next_ = true; + } + } else if (last_snapshot == current_user_key_snapshot_ || + (last_snapshot > 0 && + last_snapshot < current_user_key_snapshot_)) { + // If the earliest snapshot is which this key is visible in + // is the same as the visibility of a previous instance of the + // same key, then this kv is not visible in any snapshot. + // Hidden by an newer entry for same user key + // + // Note: Dropping this key will not affect TransactionDB write-conflict + // checking since there has already been a record returned for this key + // in this snapshot. 
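+      // Sanity check: within the same user key, newer versions are iterated
+      // first, so the previously seen sequence number must not be smaller
+      // than the current one.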
+ if (last_sequence < current_user_key_sequence_) { + ROCKS_LOG_FATAL(info_log_, + "key %s, last_sequence (%" PRIu64 + ") < current_user_key_sequence_ (%" PRIu64 ")", + ikey_.DebugString(allow_data_in_errors_, true).c_str(), + last_sequence, current_user_key_sequence_); + assert(false); + } + + ++iter_stats_.num_record_drop_hidden; // rule (A) + AdvanceInputIter(); + } else if (compaction_ != nullptr && + (ikey_.type == kTypeDeletion || + (ikey_.type == kTypeDeletionWithTimestamp && + cmp_with_history_ts_low_ < 0)) && + DefinitelyInSnapshot(ikey_.sequence, earliest_snapshot_) && + compaction_->KeyNotExistsBeyondOutputLevel(ikey_.user_key, + &level_ptrs_)) { + // TODO(noetzli): This is the only place where we use compaction_ + // (besides the constructor). We should probably get rid of this + // dependency and find a way to do similar filtering during flushes. + // + // For this user key: + // (1) there is no data in higher levels + // (2) data in lower levels will have larger sequence numbers + // (3) data in layers that are being compacted here and have + // smaller sequence numbers will be dropped in the next + // few iterations of this loop (by rule (A) above). + // Therefore this deletion marker is obsolete and can be dropped. + // + // Note: Dropping this Delete will not affect TransactionDB + // write-conflict checking since it is earlier than any snapshot. + // + // It seems that we can also drop deletion later than earliest snapshot + // given that: + // (1) The deletion is earlier than earliest_write_conflict_snapshot, and + // (2) No value exist earlier than the deletion. + // + // Note also that a deletion marker of type kTypeDeletionWithTimestamp + // will be treated as a different user key unless the timestamp is older + // than *full_history_ts_low_. + ++iter_stats_.num_record_drop_obsolete; + if (!bottommost_level_) { + ++iter_stats_.num_optimized_del_drop_obsolete; + } + AdvanceInputIter(); + } else if ((ikey_.type == kTypeDeletion || + (ikey_.type == kTypeDeletionWithTimestamp && + cmp_with_history_ts_low_ < 0)) && + bottommost_level_) { + // Handle the case where we have a delete key at the bottom most level + // We can skip outputting the key iff there are no subsequent puts for + // this key + assert(!compaction_ || compaction_->KeyNotExistsBeyondOutputLevel( + ikey_.user_key, &level_ptrs_)); + ParsedInternalKey next_ikey; + AdvanceInputIter(); +#ifndef NDEBUG + const Compaction* c = + compaction_ ? compaction_->real_compaction() : nullptr; +#endif + TEST_SYNC_POINT_CALLBACK( + "CompactionIterator::NextFromInput:BottommostDelete:1", + const_cast<Compaction*>(c)); + // Skip over all versions of this key that happen to occur in the same + // snapshot range as the delete. + // + // Note that a deletion marker of type kTypeDeletionWithTimestamp will be + // considered to have a different user key unless the timestamp is older + // than *full_history_ts_low_. 
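+      // For example, if DEL(seq=9), PUT(seq=7) and PUT(seq=5) on the same key
+      // all fall into the same snapshot stripe, the loop below skips both
+      // puts; the delete itself is then only output if yet another version of
+      // the key remains afterwards.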
+ while (!IsPausingManualCompaction() && !IsShuttingDown() && + input_.Valid() && + (ParseInternalKey(input_.key(), &next_ikey, allow_data_in_errors_) + .ok()) && + cmp_->EqualWithoutTimestamp(ikey_.user_key, next_ikey.user_key) && + (prev_snapshot == 0 || + DefinitelyNotInSnapshot(next_ikey.sequence, prev_snapshot))) { + AdvanceInputIter(); + } + // If you find you still need to output a row with this key, we need to + // output the delete too + if (input_.Valid() && + (ParseInternalKey(input_.key(), &next_ikey, allow_data_in_errors_) + .ok()) && + cmp_->EqualWithoutTimestamp(ikey_.user_key, next_ikey.user_key)) { + validity_info_.SetValid(ValidContext::kKeepDel); + at_next_ = true; + } + } else if (ikey_.type == kTypeMerge) { + if (!merge_helper_->HasOperator()) { + status_ = Status::InvalidArgument( + "merge_operator is not properly initialized."); + return; + } + + pinned_iters_mgr_.StartPinning(); + + // We know the merge type entry is not hidden, otherwise we would + // have hit (A) + // We encapsulate the merge related state machine in a different + // object to minimize change to the existing flow. + Status s = merge_helper_->MergeUntil( + &input_, range_del_agg_, prev_snapshot, bottommost_level_, + allow_data_in_errors_, blob_fetcher_.get(), full_history_ts_low_, + prefetch_buffers_.get(), &iter_stats_); + merge_out_iter_.SeekToFirst(); + + if (!s.ok() && !s.IsMergeInProgress()) { + status_ = s; + return; + } else if (merge_out_iter_.Valid()) { + // NOTE: key, value, and ikey_ refer to old entries. + // These will be correctly set below. + key_ = merge_out_iter_.key(); + value_ = merge_out_iter_.value(); + pik_status = ParseInternalKey(key_, &ikey_, allow_data_in_errors_); + // MergeUntil stops when it encounters a corrupt key and does not + // include them in the result, so we expect the keys here to valid. + if (!pik_status.ok()) { + ROCKS_LOG_FATAL( + info_log_, "Invalid key %s in compaction. %s", + allow_data_in_errors_ ? key_.ToString(true).c_str() : "hidden", + pik_status.getState()); + assert(false); + } + // Keep current_key_ in sync. + current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); + key_ = current_key_.GetInternalKey(); + ikey_.user_key = current_key_.GetUserKey(); + validity_info_.SetValid(ValidContext::kMerge2); + } else { + // all merge operands were filtered out. reset the user key, since the + // batch consumed by the merge operator should not shadow any keys + // coming after the merges + has_current_user_key_ = false; + pinned_iters_mgr_.ReleasePinnedData(); + + if (merge_helper_->FilteredUntil(&skip_until)) { + need_skip = true; + } + } + } else { + // 1. new user key -OR- + // 2. different snapshot stripe + // If user-defined timestamp is enabled, we consider keys for GC if they + // are below history_ts_low_. CompactionRangeDelAggregator::ShouldDelete() + // only considers range deletions that are at or below history_ts_low_ and + // trim_ts_. We drop keys here that are below history_ts_low_ and are + // covered by a range tombstone that is at or below history_ts_low_ and + // trim_ts. 
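+      // With user-defined timestamps disabled, timestamp_size_ is 0 and every
+      // key reaching this branch is checked against the range tombstones.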
+ bool should_delete = false; + if (!timestamp_size_ || cmp_with_history_ts_low_ < 0) { + should_delete = range_del_agg_->ShouldDelete( + key_, RangeDelPositioningMode::kForwardTraversal); + } + if (should_delete) { + ++iter_stats_.num_record_drop_hidden; + ++iter_stats_.num_record_drop_range_del; + AdvanceInputIter(); + } else { + validity_info_.SetValid(ValidContext::kNewUserKey); + } + } + + if (need_skip) { + SkipUntil(skip_until); + } + } + + if (!Valid() && IsShuttingDown()) { + status_ = Status::ShutdownInProgress(); + } + + if (IsPausingManualCompaction()) { + status_ = Status::Incomplete(Status::SubCode::kManualCompactionPaused); + } + + // Propagate corruption status from memtable itereator + if (!input_.Valid() && input_.status().IsCorruption()) { + status_ = input_.status(); + } +} + +bool CompactionIterator::ExtractLargeValueIfNeededImpl() { + if (!blob_file_builder_) { + return false; + } + + blob_index_.clear(); + const Status s = blob_file_builder_->Add(user_key(), value_, &blob_index_); + + if (!s.ok()) { + status_ = s; + validity_info_.Invalidate(); + + return false; + } + + if (blob_index_.empty()) { + return false; + } + + value_ = blob_index_; + + return true; +} + +void CompactionIterator::ExtractLargeValueIfNeeded() { + assert(ikey_.type == kTypeValue); + + if (!ExtractLargeValueIfNeededImpl()) { + return; + } + + ikey_.type = kTypeBlobIndex; + current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); +} + +void CompactionIterator::GarbageCollectBlobIfNeeded() { + assert(ikey_.type == kTypeBlobIndex); + + if (!compaction_) { + return; + } + + // GC for integrated BlobDB + if (compaction_->enable_blob_garbage_collection()) { + TEST_SYNC_POINT_CALLBACK( + "CompactionIterator::GarbageCollectBlobIfNeeded::TamperWithBlobIndex", + &value_); + + BlobIndex blob_index; + + { + const Status s = blob_index.DecodeFrom(value_); + + if (!s.ok()) { + status_ = s; + validity_info_.Invalidate(); + + return; + } + } + + if (blob_index.file_number() >= + blob_garbage_collection_cutoff_file_number_) { + return; + } + + FilePrefetchBuffer* prefetch_buffer = + prefetch_buffers_ ? 
prefetch_buffers_->GetOrCreatePrefetchBuffer( + blob_index.file_number()) + : nullptr; + + uint64_t bytes_read = 0; + + { + assert(blob_fetcher_); + + const Status s = blob_fetcher_->FetchBlob( + user_key(), blob_index, prefetch_buffer, &blob_value_, &bytes_read); + + if (!s.ok()) { + status_ = s; + validity_info_.Invalidate(); + + return; + } + } + + ++iter_stats_.num_blobs_read; + iter_stats_.total_blob_bytes_read += bytes_read; + + ++iter_stats_.num_blobs_relocated; + iter_stats_.total_blob_bytes_relocated += blob_index.size(); + + value_ = blob_value_; + + if (ExtractLargeValueIfNeededImpl()) { + return; + } + + ikey_.type = kTypeValue; + current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); + + return; + } + + // GC for stacked BlobDB + if (compaction_filter_ && + compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) { + const auto blob_decision = compaction_filter_->PrepareBlobOutput( + user_key(), value_, &compaction_filter_value_); + + if (blob_decision == CompactionFilter::BlobDecision::kCorruption) { + status_ = + Status::Corruption("Corrupted blob reference encountered during GC"); + validity_info_.Invalidate(); + + return; + } + + if (blob_decision == CompactionFilter::BlobDecision::kIOError) { + status_ = Status::IOError("Could not relocate blob during GC"); + validity_info_.Invalidate(); + + return; + } + + if (blob_decision == CompactionFilter::BlobDecision::kChangeValue) { + value_ = compaction_filter_value_; + + return; + } + } +} + +void CompactionIterator::DecideOutputLevel() { + assert(compaction_->SupportsPerKeyPlacement()); +#ifndef NDEBUG + // Could be overridden by unittest + PerKeyPlacementContext context(level_, ikey_.user_key, value_, + ikey_.sequence); + TEST_SYNC_POINT_CALLBACK("CompactionIterator::PrepareOutput.context", + &context); + output_to_penultimate_level_ = context.output_to_penultimate_level; +#else + output_to_penultimate_level_ = false; +#endif // NDEBUG + + // if the key is newer than the cutoff sequence or within the earliest + // snapshot, it should output to the penultimate level. + if (ikey_.sequence > preclude_last_level_min_seqno_ || + ikey_.sequence > earliest_snapshot_) { + output_to_penultimate_level_ = true; + } + + if (output_to_penultimate_level_) { + // If it's decided to output to the penultimate level, but unsafe to do so, + // still output to the last level. For example, moving the data from a lower + // level to a higher level outside of the higher-level input key range is + // considered unsafe, because the key may conflict with higher-level SSTs + // not from this compaction. + // TODO: add statistic for declined output_to_penultimate_level + bool safe_to_penultimate_level = + compaction_->WithinPenultimateLevelOutputRange(ikey_.user_key); + if (!safe_to_penultimate_level) { + output_to_penultimate_level_ = false; + // It could happen when disable/enable `last_level_temperature` while + // holding a snapshot. When `last_level_temperature` is not set + // (==kUnknown), the data newer than any snapshot is pushed to the last + // level, but when the per_key_placement feature is enabled on the fly, + // the data later than the snapshot has to be moved to the penultimate + // level, which may or may not be safe. So the user needs to make sure all + // snapshot is released before enabling `last_level_temperature` feature + // We will migrate the feature to `last_level_temperature` and maybe make + // it not dynamically changeable. 
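+      // A key newer than the earliest snapshot that cannot safely go to the
+      // penultimate level indicates exactly this situation, so it is reported
+      // as corruption rather than silently written to the last level.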
+ if (ikey_.sequence > earliest_snapshot_) { + status_ = Status::Corruption( + "Unsafe to store Seq later than snapshot in the last level if " + "per_key_placement is enabled"); + } + } + } +} + +void CompactionIterator::PrepareOutput() { + if (Valid()) { + if (ikey_.type == kTypeValue) { + ExtractLargeValueIfNeeded(); + } else if (ikey_.type == kTypeBlobIndex) { + GarbageCollectBlobIfNeeded(); + } + + if (compaction_ != nullptr && compaction_->SupportsPerKeyPlacement()) { + DecideOutputLevel(); + } + + // Zeroing out the sequence number leads to better compression. + // If this is the bottommost level (no files in lower levels) + // and the earliest snapshot is larger than this seqno + // and the userkey differs from the last userkey in compaction + // then we can squash the seqno to zero. + // + // This is safe for TransactionDB write-conflict checking since transactions + // only care about sequence number larger than any active snapshots. + // + // Can we do the same for levels above bottom level as long as + // KeyNotExistsBeyondOutputLevel() return true? + if (Valid() && compaction_ != nullptr && + !compaction_->allow_ingest_behind() && bottommost_level_ && + DefinitelyInSnapshot(ikey_.sequence, earliest_snapshot_) && + ikey_.type != kTypeMerge && current_key_committed_ && + !output_to_penultimate_level_ && + ikey_.sequence < preserve_time_min_seqno_) { + if (ikey_.type == kTypeDeletion || + (ikey_.type == kTypeSingleDeletion && timestamp_size_ == 0)) { + ROCKS_LOG_FATAL( + info_log_, + "Unexpected key %s for seq-zero optimization. " + "earliest_snapshot %" PRIu64 + ", earliest_write_conflict_snapshot %" PRIu64 + " job_snapshot %" PRIu64 + ". timestamp_size: %d full_history_ts_low_ %s. validity %x", + ikey_.DebugString(allow_data_in_errors_, true).c_str(), + earliest_snapshot_, earliest_write_conflict_snapshot_, + job_snapshot_, static_cast<int>(timestamp_size_), + full_history_ts_low_ != nullptr + ? Slice(*full_history_ts_low_).ToString(true).c_str() + : "null", + validity_info_.rep); + assert(false); + } + ikey_.sequence = 0; + last_key_seq_zeroed_ = true; + TEST_SYNC_POINT_CALLBACK("CompactionIterator::PrepareOutput:ZeroingSeq", + &ikey_); + if (!timestamp_size_) { + current_key_.UpdateInternalKey(0, ikey_.type); + } else if (full_history_ts_low_ && cmp_with_history_ts_low_ < 0) { + // We can also zero out timestamp for better compression. + // For the same user key (excluding timestamp), the timestamp-based + // history can be collapsed to save some space if the timestamp is + // older than *full_history_ts_low_. 
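+        // The collapsed timestamp is the minimum timestamp (timestamp_size_
+        // zero bytes), mirroring the zeroed-out sequence number above.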
+ const std::string kTsMin(timestamp_size_, static_cast<char>(0)); + const Slice ts_slice = kTsMin; + ikey_.SetTimestamp(ts_slice); + current_key_.UpdateInternalKey(0, ikey_.type, &ts_slice); + } + } + } +} + +inline SequenceNumber CompactionIterator::findEarliestVisibleSnapshot( + SequenceNumber in, SequenceNumber* prev_snapshot) { + assert(snapshots_->size()); + if (snapshots_->size() == 0) { + ROCKS_LOG_FATAL(info_log_, + "No snapshot left in findEarliestVisibleSnapshot"); + } + auto snapshots_iter = + std::lower_bound(snapshots_->begin(), snapshots_->end(), in); + assert(prev_snapshot != nullptr); + if (snapshots_iter == snapshots_->begin()) { + *prev_snapshot = 0; + } else { + *prev_snapshot = *std::prev(snapshots_iter); + if (*prev_snapshot >= in) { + ROCKS_LOG_FATAL(info_log_, + "*prev_snapshot (%" PRIu64 ") >= in (%" PRIu64 + ") in findEarliestVisibleSnapshot", + *prev_snapshot, in); + assert(false); + } + } + if (snapshot_checker_ == nullptr) { + return snapshots_iter != snapshots_->end() ? *snapshots_iter + : kMaxSequenceNumber; + } + bool has_released_snapshot = !released_snapshots_.empty(); + for (; snapshots_iter != snapshots_->end(); ++snapshots_iter) { + auto cur = *snapshots_iter; + if (in > cur) { + ROCKS_LOG_FATAL(info_log_, + "in (%" PRIu64 ") > cur (%" PRIu64 + ") in findEarliestVisibleSnapshot", + in, cur); + assert(false); + } + // Skip if cur is in released_snapshots. + if (has_released_snapshot && released_snapshots_.count(cur) > 0) { + continue; + } + auto res = snapshot_checker_->CheckInSnapshot(in, cur); + if (res == SnapshotCheckerResult::kInSnapshot) { + return cur; + } else if (res == SnapshotCheckerResult::kSnapshotReleased) { + released_snapshots_.insert(cur); + } + *prev_snapshot = cur; + } + return kMaxSequenceNumber; +} + +uint64_t CompactionIterator::ComputeBlobGarbageCollectionCutoffFileNumber( + const CompactionProxy* compaction) { + if (!compaction) { + return 0; + } + + if (!compaction->enable_blob_garbage_collection()) { + return 0; + } + + const Version* const version = compaction->input_version(); + assert(version); + + const VersionStorageInfo* const storage_info = version->storage_info(); + assert(storage_info); + + const auto& blob_files = storage_info->GetBlobFiles(); + + const size_t cutoff_index = static_cast<size_t>( + compaction->blob_garbage_collection_age_cutoff() * blob_files.size()); + + if (cutoff_index >= blob_files.size()) { + return std::numeric_limits<uint64_t>::max(); + } + + const auto& meta = blob_files[cutoff_index]; + assert(meta); + + return meta->GetBlobFileNumber(); +} + +std::unique_ptr<BlobFetcher> CompactionIterator::CreateBlobFetcherIfNeeded( + const CompactionProxy* compaction) { + if (!compaction) { + return nullptr; + } + + const Version* const version = compaction->input_version(); + if (!version) { + return nullptr; + } + + ReadOptions read_options; + read_options.fill_cache = false; + + return std::unique_ptr<BlobFetcher>(new BlobFetcher(version, read_options)); +} + +std::unique_ptr<PrefetchBufferCollection> +CompactionIterator::CreatePrefetchBufferCollectionIfNeeded( + const CompactionProxy* compaction) { + if (!compaction) { + return nullptr; + } + + if (!compaction->input_version()) { + return nullptr; + } + + if (compaction->allow_mmap_reads()) { + return nullptr; + } + + const uint64_t readahead_size = compaction->blob_compaction_readahead_size(); + if (!readahead_size) { + return nullptr; + } + + return std::unique_ptr<PrefetchBufferCollection>( + new PrefetchBufferCollection(readahead_size)); +} + +} // 
namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/compaction/compaction_iterator.h b/src/rocksdb/db/compaction/compaction_iterator.h new file mode 100644 index 000000000..c215d2bbb --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_iterator.h @@ -0,0 +1,513 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#include <algorithm> +#include <cinttypes> +#include <deque> +#include <string> +#include <unordered_set> +#include <vector> + +#include "db/compaction/compaction.h" +#include "db/compaction/compaction_iteration_stats.h" +#include "db/merge_helper.h" +#include "db/pinned_iterators_manager.h" +#include "db/range_del_aggregator.h" +#include "db/snapshot_checker.h" +#include "options/cf_options.h" +#include "rocksdb/compaction_filter.h" + +namespace ROCKSDB_NAMESPACE { + +class BlobFileBuilder; +class BlobFetcher; +class PrefetchBufferCollection; + +// A wrapper of internal iterator whose purpose is to count how +// many entries there are in the iterator. +class SequenceIterWrapper : public InternalIterator { + public: + SequenceIterWrapper(InternalIterator* iter, const Comparator* cmp, + bool need_count_entries) + : icmp_(cmp), + inner_iter_(iter), + need_count_entries_(need_count_entries) {} + bool Valid() const override { return inner_iter_->Valid(); } + Status status() const override { return inner_iter_->status(); } + void Next() override { + num_itered_++; + inner_iter_->Next(); + } + void Seek(const Slice& target) override { + if (!need_count_entries_) { + inner_iter_->Seek(target); + } else { + // For flush cases, we need to count total number of entries, so we + // do Next() rather than Seek(). + while (inner_iter_->Valid() && + icmp_.Compare(inner_iter_->key(), target) < 0) { + Next(); + } + } + } + Slice key() const override { return inner_iter_->key(); } + Slice value() const override { return inner_iter_->value(); } + + // Unused InternalIterator methods + void SeekToFirst() override { assert(false); } + void Prev() override { assert(false); } + void SeekForPrev(const Slice& /* target */) override { assert(false); } + void SeekToLast() override { assert(false); } + + uint64_t num_itered() const { return num_itered_; } + + private: + InternalKeyComparator icmp_; + InternalIterator* inner_iter_; // not owned + uint64_t num_itered_ = 0; + bool need_count_entries_; +}; + +class CompactionIterator { + public: + // A wrapper around Compaction. Has a much smaller interface, only what + // CompactionIterator uses. Tests can override it. + class CompactionProxy { + public: + virtual ~CompactionProxy() = default; + + virtual int level() const = 0; + + virtual bool KeyNotExistsBeyondOutputLevel( + const Slice& user_key, std::vector<size_t>* level_ptrs) const = 0; + + virtual bool bottommost_level() const = 0; + + virtual int number_levels() const = 0; + + // Result includes timestamp if user-defined timestamp is enabled. 
+ virtual Slice GetLargestUserKey() const = 0; + + virtual bool allow_ingest_behind() const = 0; + + virtual bool allow_mmap_reads() const = 0; + + virtual bool enable_blob_garbage_collection() const = 0; + + virtual double blob_garbage_collection_age_cutoff() const = 0; + + virtual uint64_t blob_compaction_readahead_size() const = 0; + + virtual const Version* input_version() const = 0; + + virtual bool DoesInputReferenceBlobFiles() const = 0; + + virtual const Compaction* real_compaction() const = 0; + + virtual bool SupportsPerKeyPlacement() const = 0; + + // `key` includes timestamp if user-defined timestamp is enabled. + virtual bool WithinPenultimateLevelOutputRange(const Slice& key) const = 0; + }; + + class RealCompaction : public CompactionProxy { + public: + explicit RealCompaction(const Compaction* compaction) + : compaction_(compaction) { + assert(compaction_); + assert(compaction_->immutable_options()); + assert(compaction_->mutable_cf_options()); + } + + int level() const override { return compaction_->level(); } + + bool KeyNotExistsBeyondOutputLevel( + const Slice& user_key, std::vector<size_t>* level_ptrs) const override { + return compaction_->KeyNotExistsBeyondOutputLevel(user_key, level_ptrs); + } + + bool bottommost_level() const override { + return compaction_->bottommost_level(); + } + + int number_levels() const override { return compaction_->number_levels(); } + + // Result includes timestamp if user-defined timestamp is enabled. + Slice GetLargestUserKey() const override { + return compaction_->GetLargestUserKey(); + } + + bool allow_ingest_behind() const override { + return compaction_->immutable_options()->allow_ingest_behind; + } + + bool allow_mmap_reads() const override { + return compaction_->immutable_options()->allow_mmap_reads; + } + + bool enable_blob_garbage_collection() const override { + return compaction_->enable_blob_garbage_collection(); + } + + double blob_garbage_collection_age_cutoff() const override { + return compaction_->blob_garbage_collection_age_cutoff(); + } + + uint64_t blob_compaction_readahead_size() const override { + return compaction_->mutable_cf_options()->blob_compaction_readahead_size; + } + + const Version* input_version() const override { + return compaction_->input_version(); + } + + bool DoesInputReferenceBlobFiles() const override { + return compaction_->DoesInputReferenceBlobFiles(); + } + + const Compaction* real_compaction() const override { return compaction_; } + + bool SupportsPerKeyPlacement() const override { + return compaction_->SupportsPerKeyPlacement(); + } + + // Check if key is within penultimate level output range, to see if it's + // safe to output to the penultimate level for per_key_placement feature. + // `key` includes timestamp if user-defined timestamp is enabled. 
+ bool WithinPenultimateLevelOutputRange(const Slice& key) const override { + return compaction_->WithinPenultimateLevelOutputRange(key); + } + + private: + const Compaction* compaction_; + }; + + CompactionIterator( + InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper, + SequenceNumber last_sequence, std::vector<SequenceNumber>* snapshots, + SequenceNumber earliest_write_conflict_snapshot, + SequenceNumber job_snapshot, const SnapshotChecker* snapshot_checker, + Env* env, bool report_detailed_time, bool expect_valid_internal_key, + CompactionRangeDelAggregator* range_del_agg, + BlobFileBuilder* blob_file_builder, bool allow_data_in_errors, + bool enforce_single_del_contracts, + const std::atomic<bool>& manual_compaction_canceled, + const Compaction* compaction = nullptr, + const CompactionFilter* compaction_filter = nullptr, + const std::atomic<bool>* shutting_down = nullptr, + const std::shared_ptr<Logger> info_log = nullptr, + const std::string* full_history_ts_low = nullptr, + const SequenceNumber preserve_time_min_seqno = kMaxSequenceNumber, + const SequenceNumber preclude_last_level_min_seqno = kMaxSequenceNumber); + + // Constructor with custom CompactionProxy, used for tests. + CompactionIterator( + InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper, + SequenceNumber last_sequence, std::vector<SequenceNumber>* snapshots, + SequenceNumber earliest_write_conflict_snapshot, + SequenceNumber job_snapshot, const SnapshotChecker* snapshot_checker, + Env* env, bool report_detailed_time, bool expect_valid_internal_key, + CompactionRangeDelAggregator* range_del_agg, + BlobFileBuilder* blob_file_builder, bool allow_data_in_errors, + bool enforce_single_del_contracts, + const std::atomic<bool>& manual_compaction_canceled, + std::unique_ptr<CompactionProxy> compaction, + const CompactionFilter* compaction_filter = nullptr, + const std::atomic<bool>* shutting_down = nullptr, + const std::shared_ptr<Logger> info_log = nullptr, + const std::string* full_history_ts_low = nullptr, + const SequenceNumber preserve_time_min_seqno = kMaxSequenceNumber, + const SequenceNumber preclude_last_level_min_seqno = kMaxSequenceNumber); + + ~CompactionIterator(); + + void ResetRecordCounts(); + + // Seek to the beginning of the compaction iterator output. + // + // REQUIRED: Call only once. + void SeekToFirst(); + + // Produces the next record in the compaction. + // + // REQUIRED: SeekToFirst() has been called. + void Next(); + + // Getters + const Slice& key() const { return key_; } + const Slice& value() const { return value_; } + const Status& status() const { return status_; } + const ParsedInternalKey& ikey() const { return ikey_; } + inline bool Valid() const { return validity_info_.IsValid(); } + const Slice& user_key() const { return current_user_key_; } + const CompactionIterationStats& iter_stats() const { return iter_stats_; } + uint64_t num_input_entry_scanned() const { return input_.num_itered(); } + // If the current key should be placed on penultimate level, only valid if + // per_key_placement is supported + bool output_to_penultimate_level() const { + return output_to_penultimate_level_; + } + Status InputStatus() const { return input_.status(); } + + private: + // Processes the input stream to find the next output + void NextFromInput(); + + // Do final preparations before presenting the output to the callee. 
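+  // This covers extracting large values into blob files, garbage collecting
+  // blob references, deciding the output level when per-key placement is in
+  // use, and zeroing out sequence numbers (and, with user-defined timestamps,
+  // collapsing timestamps older than full_history_ts_low_) at the bottommost
+  // level.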
+ void PrepareOutput(); + + // Decide the current key should be output to the last level or penultimate + // level, only call for compaction supports per key placement + void DecideOutputLevel(); + + // Passes the output value to the blob file builder (if any), and replaces it + // with the corresponding blob reference if it has been actually written to a + // blob file (i.e. if it passed the value size check). Returns true if the + // value got extracted to a blob file, false otherwise. + bool ExtractLargeValueIfNeededImpl(); + + // Extracts large values as described above, and updates the internal key's + // type to kTypeBlobIndex if the value got extracted. Should only be called + // for regular values (kTypeValue). + void ExtractLargeValueIfNeeded(); + + // Relocates valid blobs residing in the oldest blob files if garbage + // collection is enabled. Relocated blobs are written to new blob files or + // inlined in the LSM tree depending on the current settings (i.e. + // enable_blob_files and min_blob_size). Should only be called for blob + // references (kTypeBlobIndex). + // + // Note: the stacked BlobDB implementation's compaction filter based GC + // algorithm is also called from here. + void GarbageCollectBlobIfNeeded(); + + // Invoke compaction filter if needed. + // Return true on success, false on failures (e.g.: kIOError). + bool InvokeFilterIfNeeded(bool* need_skip, Slice* skip_until); + + // Given a sequence number, return the sequence number of the + // earliest snapshot that this sequence number is visible in. + // The snapshots themselves are arranged in ascending order of + // sequence numbers. + // Employ a sequential search because the total number of + // snapshots are typically small. + inline SequenceNumber findEarliestVisibleSnapshot( + SequenceNumber in, SequenceNumber* prev_snapshot); + + inline bool KeyCommitted(SequenceNumber sequence) { + return snapshot_checker_ == nullptr || + snapshot_checker_->CheckInSnapshot(sequence, job_snapshot_) == + SnapshotCheckerResult::kInSnapshot; + } + + bool DefinitelyInSnapshot(SequenceNumber seq, SequenceNumber snapshot); + + bool DefinitelyNotInSnapshot(SequenceNumber seq, SequenceNumber snapshot); + + // Extract user-defined timestamp from user key if possible and compare it + // with *full_history_ts_low_ if applicable. + inline void UpdateTimestampAndCompareWithFullHistoryLow() { + if (!timestamp_size_) { + return; + } + Slice ts = ExtractTimestampFromUserKey(ikey_.user_key, timestamp_size_); + curr_ts_.assign(ts.data(), ts.size()); + if (full_history_ts_low_) { + cmp_with_history_ts_low_ = + cmp_->CompareTimestamp(ts, *full_history_ts_low_); + } + } + + static uint64_t ComputeBlobGarbageCollectionCutoffFileNumber( + const CompactionProxy* compaction); + static std::unique_ptr<BlobFetcher> CreateBlobFetcherIfNeeded( + const CompactionProxy* compaction); + static std::unique_ptr<PrefetchBufferCollection> + CreatePrefetchBufferCollectionIfNeeded(const CompactionProxy* compaction); + + SequenceIterWrapper input_; + const Comparator* cmp_; + MergeHelper* merge_helper_; + const std::vector<SequenceNumber>* snapshots_; + // List of snapshots released during compaction. + // findEarliestVisibleSnapshot() find them out from return of + // snapshot_checker, and make sure they will not be returned as + // earliest visible snapshot of an older value. + // See WritePreparedTransactionTest::ReleaseSnapshotDuringCompaction3. 
+ std::unordered_set<SequenceNumber> released_snapshots_; + const SequenceNumber earliest_write_conflict_snapshot_; + const SequenceNumber job_snapshot_; + const SnapshotChecker* const snapshot_checker_; + Env* env_; + SystemClock* clock_; + const bool report_detailed_time_; + const bool expect_valid_internal_key_; + CompactionRangeDelAggregator* range_del_agg_; + BlobFileBuilder* blob_file_builder_; + std::unique_ptr<CompactionProxy> compaction_; + const CompactionFilter* compaction_filter_; + const std::atomic<bool>* shutting_down_; + const std::atomic<bool>& manual_compaction_canceled_; + const bool bottommost_level_; + const bool visible_at_tip_; + const SequenceNumber earliest_snapshot_; + + std::shared_ptr<Logger> info_log_; + + const bool allow_data_in_errors_; + + const bool enforce_single_del_contracts_; + + // Comes from comparator. + const size_t timestamp_size_; + + // Lower bound timestamp to retain full history in terms of user-defined + // timestamp. If a key's timestamp is older than full_history_ts_low_, then + // the key *may* be eligible for garbage collection (GC). The skipping logic + // is in `NextFromInput()` and `PrepareOutput()`. + // If nullptr, NO GC will be performed and all history will be preserved. + const std::string* const full_history_ts_low_; + + // State + // + enum ValidContext : uint8_t { + kMerge1 = 0, + kMerge2 = 1, + kParseKeyError = 2, + kCurrentKeyUncommitted = 3, + kKeepSDAndClearPut = 4, + kKeepTsHistory = 5, + kKeepSDForConflictCheck = 6, + kKeepSDForSnapshot = 7, + kKeepSD = 8, + kKeepDel = 9, + kNewUserKey = 10, + }; + + struct ValidityInfo { + inline bool IsValid() const { return rep & 1; } + ValidContext GetContext() const { + return static_cast<ValidContext>(rep >> 1); + } + inline void SetValid(uint8_t ctx) { rep = (ctx << 1) | 1; } + inline void Invalidate() { rep = 0; } + + uint8_t rep{0}; + } validity_info_; + + // Points to a copy of the current compaction iterator output (current_key_) + // if valid. + Slice key_; + // Points to the value in the underlying iterator that corresponds to the + // current output. + Slice value_; + // The status is OK unless compaction iterator encounters a merge operand + // while not having a merge operator defined. + Status status_; + // Stores the user key, sequence number and type of the current compaction + // iterator output (or current key in the underlying iterator during + // NextFromInput()). + ParsedInternalKey ikey_; + // Stores whether ikey_.user_key is valid. If set to false, the user key is + // not compared against the current key in the underlying iterator. + bool has_current_user_key_ = false; + // If false, the iterator holds a copy of the current compaction iterator + // output (or current key in the underlying iterator during NextFromInput()). + bool at_next_ = false; + + IterKey current_key_; + Slice current_user_key_; + std::string curr_ts_; + SequenceNumber current_user_key_sequence_; + SequenceNumber current_user_key_snapshot_; + + // True if the iterator has already returned a record for the current key. + bool has_outputted_key_ = false; + + // truncated the value of the next key and output it without applying any + // compaction rules. This is used for outputting a put after a single delete. + bool clear_and_output_next_key_ = false; + + MergeOutputIterator merge_out_iter_; + // PinnedIteratorsManager used to pin input_ Iterator blocks while reading + // merge operands and then releasing them after consuming them. 
+ PinnedIteratorsManager pinned_iters_mgr_; + + uint64_t blob_garbage_collection_cutoff_file_number_; + + std::unique_ptr<BlobFetcher> blob_fetcher_; + std::unique_ptr<PrefetchBufferCollection> prefetch_buffers_; + + std::string blob_index_; + PinnableSlice blob_value_; + std::string compaction_filter_value_; + InternalKey compaction_filter_skip_until_; + // "level_ptrs" holds indices that remember which file of an associated + // level we were last checking during the last call to compaction-> + // KeyNotExistsBeyondOutputLevel(). This allows future calls to the function + // to pick off where it left off since each subcompaction's key range is + // increasing so a later call to the function must be looking for a key that + // is in or beyond the last file checked during the previous call + std::vector<size_t> level_ptrs_; + CompactionIterationStats iter_stats_; + + // Used to avoid purging uncommitted values. The application can specify + // uncommitted values by providing a SnapshotChecker object. + bool current_key_committed_; + + // Saved result of ucmp->CompareTimestamp(current_ts_, *full_history_ts_low_) + int cmp_with_history_ts_low_; + + const int level_; + + // True if the previous internal key (same user key)'s sequence number has + // just been zeroed out during bottommost compaction. + bool last_key_seq_zeroed_{false}; + + // True if the current key should be output to the penultimate level if + // possible, compaction logic makes the final decision on which level to + // output to. + bool output_to_penultimate_level_{false}; + + // min seqno for preserving the time information. + const SequenceNumber preserve_time_min_seqno_ = kMaxSequenceNumber; + + // min seqno to preclude the data from the last level, if the key seqno larger + // than this, it will be output to penultimate level + const SequenceNumber preclude_last_level_min_seqno_ = kMaxSequenceNumber; + + void AdvanceInputIter() { input_.Next(); } + + void SkipUntil(const Slice& skip_until) { input_.Seek(skip_until); } + + bool IsShuttingDown() { + // This is a best-effort facility, so memory_order_relaxed is sufficient. + return shutting_down_ && shutting_down_->load(std::memory_order_relaxed); + } + + bool IsPausingManualCompaction() { + // This is a best-effort facility, so memory_order_relaxed is sufficient. + return manual_compaction_canceled_.load(std::memory_order_relaxed); + } +}; + +inline bool CompactionIterator::DefinitelyInSnapshot(SequenceNumber seq, + SequenceNumber snapshot) { + return ((seq) <= (snapshot) && + (snapshot_checker_ == nullptr || + LIKELY(snapshot_checker_->CheckInSnapshot((seq), (snapshot)) == + SnapshotCheckerResult::kInSnapshot))); +} + +inline bool CompactionIterator::DefinitelyNotInSnapshot( + SequenceNumber seq, SequenceNumber snapshot) { + return ((seq) > (snapshot) || + (snapshot_checker_ != nullptr && + UNLIKELY(snapshot_checker_->CheckInSnapshot((seq), (snapshot)) == + SnapshotCheckerResult::kNotInSnapshot))); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/compaction/compaction_iterator_test.cc b/src/rocksdb/db/compaction/compaction_iterator_test.cc new file mode 100644 index 000000000..81362d792 --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_iterator_test.cc @@ -0,0 +1,1618 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
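+//
+// The tests in this file drive a CompactionIterator directly. As a rough
+// sketch of the pattern they exercise (the `builder` consumer below is
+// hypothetical and not part of this file):
+//
+//   c_iter->SeekToFirst();
+//   while (c_iter->Valid()) {
+//     builder->Add(c_iter->key(), c_iter->value());  // consume one output KV
+//     c_iter->Next();
+//   }
+//   ASSERT_OK(c_iter->status());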
+ +#include "db/compaction/compaction_iterator.h" + +#include <string> +#include <vector> + +#include "db/dbformat.h" +#include "port/port.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/string_util.h" +#include "util/vector_iterator.h" +#include "utilities/merge_operators.h" + +namespace ROCKSDB_NAMESPACE { + +// Expects no merging attempts. +class NoMergingMergeOp : public MergeOperator { + public: + bool FullMergeV2(const MergeOperationInput& /*merge_in*/, + MergeOperationOutput* /*merge_out*/) const override { + ADD_FAILURE(); + return false; + } + bool PartialMergeMulti(const Slice& /*key*/, + const std::deque<Slice>& /*operand_list*/, + std::string* /*new_value*/, + Logger* /*logger*/) const override { + ADD_FAILURE(); + return false; + } + const char* Name() const override { + return "CompactionIteratorTest NoMergingMergeOp"; + } +}; + +// Compaction filter that gets stuck when it sees a particular key, +// then gets unstuck when told to. +// Always returns Decision::kRemove. +class StallingFilter : public CompactionFilter { + public: + Decision FilterV2(int /*level*/, const Slice& key, ValueType /*type*/, + const Slice& /*existing_value*/, std::string* /*new_value*/, + std::string* /*skip_until*/) const override { + int k = std::atoi(key.ToString().c_str()); + last_seen.store(k); + while (k >= stall_at.load()) { + std::this_thread::yield(); + } + return Decision::kRemove; + } + + const char* Name() const override { + return "CompactionIteratorTest StallingFilter"; + } + + // Wait until the filter sees a key >= k and stalls at that key. + // If `exact`, asserts that the seen key is equal to k. + void WaitForStall(int k, bool exact = true) { + stall_at.store(k); + while (last_seen.load() < k) { + std::this_thread::yield(); + } + if (exact) { + EXPECT_EQ(k, last_seen.load()); + } + } + + // Filter will stall on key >= stall_at. Advance stall_at to unstall. + mutable std::atomic<int> stall_at{0}; + // Last key the filter was called with. + mutable std::atomic<int> last_seen{0}; +}; + +// Compaction filter that filter out all keys. 
+class FilterAllKeysCompactionFilter : public CompactionFilter { + public: + Decision FilterV2(int /*level*/, const Slice& /*key*/, ValueType /*type*/, + const Slice& /*existing_value*/, std::string* /*new_value*/, + std::string* /*skip_until*/) const override { + return Decision::kRemove; + } + + const char* Name() const override { return "AllKeysCompactionFilter"; } +}; + +class LoggingForwardVectorIterator : public VectorIterator { + public: + struct Action { + enum class Type { + SEEK_TO_FIRST, + SEEK, + NEXT, + }; + + Type type; + std::string arg; + + explicit Action(Type _type, std::string _arg = "") + : type(_type), arg(_arg) {} + + bool operator==(const Action& rhs) const { + return std::tie(type, arg) == std::tie(rhs.type, rhs.arg); + } + }; + + LoggingForwardVectorIterator(const std::vector<std::string>& keys, + const std::vector<std::string>& values) + : VectorIterator(keys, values) { + current_ = keys_.size(); + } + + void SeekToFirst() override { + log.emplace_back(Action::Type::SEEK_TO_FIRST); + VectorIterator::SeekToFirst(); + } + void SeekToLast() override { assert(false); } + + void Seek(const Slice& target) override { + log.emplace_back(Action::Type::SEEK, target.ToString()); + VectorIterator::Seek(target); + } + + void SeekForPrev(const Slice& /*target*/) override { assert(false); } + + void Next() override { + assert(Valid()); + log.emplace_back(Action::Type::NEXT); + VectorIterator::Next(); + } + void Prev() override { assert(false); } + + Slice key() const override { + assert(Valid()); + return VectorIterator::key(); + } + Slice value() const override { + assert(Valid()); + return VectorIterator::value(); + } + + std::vector<Action> log; +}; + +class FakeCompaction : public CompactionIterator::CompactionProxy { + public: + int level() const override { return 0; } + + bool KeyNotExistsBeyondOutputLevel( + const Slice& /*user_key*/, + std::vector<size_t>* /*level_ptrs*/) const override { + return is_bottommost_level || key_not_exists_beyond_output_level; + } + + bool bottommost_level() const override { return is_bottommost_level; } + + int number_levels() const override { return 1; } + + Slice GetLargestUserKey() const override { + return "\xff\xff\xff\xff\xff\xff\xff\xff\xff"; + } + + bool allow_ingest_behind() const override { return is_allow_ingest_behind; } + + bool allow_mmap_reads() const override { return false; } + + bool enable_blob_garbage_collection() const override { return false; } + + double blob_garbage_collection_age_cutoff() const override { return 0.0; } + + uint64_t blob_compaction_readahead_size() const override { return 0; } + + const Version* input_version() const override { return nullptr; } + + bool DoesInputReferenceBlobFiles() const override { return false; } + + const Compaction* real_compaction() const override { return nullptr; } + + bool SupportsPerKeyPlacement() const override { + return supports_per_key_placement; + } + + bool WithinPenultimateLevelOutputRange(const Slice& key) const override { + return (!key.starts_with("unsafe_pb")); + } + + bool key_not_exists_beyond_output_level = false; + + bool is_bottommost_level = false; + + bool is_allow_ingest_behind = false; + + bool supports_per_key_placement = false; +}; + +// A simplified snapshot checker which assumes each snapshot has a global +// last visible sequence. 
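+// CheckInSnapshot(seq, snapshot_seq) returns kInSnapshot iff seq is at or
+// below the last visible sequence recorded for snapshot_seq; passing
+// kMaxSequenceNumber as the snapshot checks against last_committed_sequence_
+// instead. For example, after AddSnapshot(2, 1) a key at seq 2 is not visible
+// to snapshot 2, while a key at seq 1 is.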
+class TestSnapshotChecker : public SnapshotChecker { + public: + explicit TestSnapshotChecker( + SequenceNumber last_committed_sequence, + const std::unordered_map<SequenceNumber, SequenceNumber>& snapshots = + {{}}) + : last_committed_sequence_(last_committed_sequence), + snapshots_(snapshots) {} + + SnapshotCheckerResult CheckInSnapshot( + SequenceNumber seq, SequenceNumber snapshot_seq) const override { + if (snapshot_seq == kMaxSequenceNumber) { + return seq <= last_committed_sequence_ + ? SnapshotCheckerResult::kInSnapshot + : SnapshotCheckerResult::kNotInSnapshot; + } + assert(snapshots_.count(snapshot_seq) > 0); + return seq <= snapshots_.at(snapshot_seq) + ? SnapshotCheckerResult::kInSnapshot + : SnapshotCheckerResult::kNotInSnapshot; + } + + private: + SequenceNumber last_committed_sequence_; + // A map of valid snapshot to last visible sequence to the snapshot. + std::unordered_map<SequenceNumber, SequenceNumber> snapshots_; +}; + +// Test param: +// bool: whether to pass snapshot_checker to compaction iterator. +class CompactionIteratorTest : public testing::TestWithParam<bool> { + public: + CompactionIteratorTest() + : cmp_(BytewiseComparator()), icmp_(cmp_), snapshots_({}) {} + + explicit CompactionIteratorTest(const Comparator* ucmp) + : cmp_(ucmp), icmp_(cmp_), snapshots_({}) {} + + void InitIterators( + const std::vector<std::string>& ks, const std::vector<std::string>& vs, + const std::vector<std::string>& range_del_ks, + const std::vector<std::string>& range_del_vs, + SequenceNumber last_sequence, + SequenceNumber last_committed_sequence = kMaxSequenceNumber, + MergeOperator* merge_op = nullptr, CompactionFilter* filter = nullptr, + bool bottommost_level = false, + SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber, + bool key_not_exists_beyond_output_level = false, + const std::string* full_history_ts_low = nullptr) { + std::unique_ptr<InternalIterator> unfragmented_range_del_iter( + new VectorIterator(range_del_ks, range_del_vs, &icmp_)); + auto tombstone_list = std::make_shared<FragmentedRangeTombstoneList>( + std::move(unfragmented_range_del_iter), icmp_); + std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter( + new FragmentedRangeTombstoneIterator(tombstone_list, icmp_, + kMaxSequenceNumber)); + range_del_agg_.reset(new CompactionRangeDelAggregator(&icmp_, snapshots_)); + range_del_agg_->AddTombstones(std::move(range_del_iter)); + + std::unique_ptr<CompactionIterator::CompactionProxy> compaction; + if (filter || bottommost_level || key_not_exists_beyond_output_level) { + compaction_proxy_ = new FakeCompaction(); + compaction_proxy_->is_bottommost_level = bottommost_level; + compaction_proxy_->is_allow_ingest_behind = AllowIngestBehind(); + compaction_proxy_->key_not_exists_beyond_output_level = + key_not_exists_beyond_output_level; + compaction_proxy_->supports_per_key_placement = SupportsPerKeyPlacement(); + compaction.reset(compaction_proxy_); + } + bool use_snapshot_checker = UseSnapshotChecker() || GetParam(); + if (use_snapshot_checker || last_committed_sequence < kMaxSequenceNumber) { + snapshot_checker_.reset( + new TestSnapshotChecker(last_committed_sequence, snapshot_map_)); + } + merge_helper_.reset( + new MergeHelper(Env::Default(), cmp_, merge_op, filter, nullptr, false, + 0 /*latest_snapshot*/, snapshot_checker_.get(), + 0 /*level*/, nullptr /*statistics*/, &shutting_down_)); + + if (c_iter_) { + // Since iter_ is still used in ~CompactionIterator(), we call + // ~CompactionIterator() first. 
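+      // Otherwise the old CompactionIterator would only be destroyed after
+      // iter_ is reset below, leaving its destructor with a dangling input
+      // iterator.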
+ c_iter_.reset(); + } + iter_.reset(new LoggingForwardVectorIterator(ks, vs)); + iter_->SeekToFirst(); + c_iter_.reset(new CompactionIterator( + iter_.get(), cmp_, merge_helper_.get(), last_sequence, &snapshots_, + earliest_write_conflict_snapshot, kMaxSequenceNumber, + snapshot_checker_.get(), Env::Default(), + false /* report_detailed_time */, false, range_del_agg_.get(), + nullptr /* blob_file_builder */, true /*allow_data_in_errors*/, + true /*enforce_single_del_contracts*/, + /*manual_compaction_canceled=*/kManualCompactionCanceledFalse_, + std::move(compaction), filter, &shutting_down_, /*info_log=*/nullptr, + full_history_ts_low)); + } + + void AddSnapshot(SequenceNumber snapshot, + SequenceNumber last_visible_seq = kMaxSequenceNumber) { + snapshots_.push_back(snapshot); + snapshot_map_[snapshot] = last_visible_seq; + } + + virtual bool UseSnapshotChecker() const { return false; } + + virtual bool AllowIngestBehind() const { return false; } + + virtual bool SupportsPerKeyPlacement() const { return false; } + + void RunTest( + const std::vector<std::string>& input_keys, + const std::vector<std::string>& input_values, + const std::vector<std::string>& expected_keys, + const std::vector<std::string>& expected_values, + SequenceNumber last_committed_seq = kMaxSequenceNumber, + MergeOperator* merge_operator = nullptr, + CompactionFilter* compaction_filter = nullptr, + bool bottommost_level = false, + SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber, + bool key_not_exists_beyond_output_level = false, + const std::string* full_history_ts_low = nullptr) { + InitIterators(input_keys, input_values, {}, {}, kMaxSequenceNumber, + last_committed_seq, merge_operator, compaction_filter, + bottommost_level, earliest_write_conflict_snapshot, + key_not_exists_beyond_output_level, full_history_ts_low); + c_iter_->SeekToFirst(); + for (size_t i = 0; i < expected_keys.size(); i++) { + std::string info = "i = " + std::to_string(i); + ASSERT_TRUE(c_iter_->Valid()) << info; + ASSERT_OK(c_iter_->status()) << info; + ASSERT_EQ(expected_keys[i], c_iter_->key().ToString()) << info; + ASSERT_EQ(expected_values[i], c_iter_->value().ToString()) << info; + c_iter_->Next(); + } + ASSERT_OK(c_iter_->status()); + ASSERT_FALSE(c_iter_->Valid()); + } + + void ClearSnapshots() { + snapshots_.clear(); + snapshot_map_.clear(); + } + + const Comparator* cmp_; + const InternalKeyComparator icmp_; + std::vector<SequenceNumber> snapshots_; + // A map of valid snapshot to last visible sequence to the snapshot. + std::unordered_map<SequenceNumber, SequenceNumber> snapshot_map_; + std::unique_ptr<MergeHelper> merge_helper_; + std::unique_ptr<LoggingForwardVectorIterator> iter_; + std::unique_ptr<CompactionIterator> c_iter_; + std::unique_ptr<CompactionRangeDelAggregator> range_del_agg_; + std::unique_ptr<SnapshotChecker> snapshot_checker_; + std::atomic<bool> shutting_down_{false}; + const std::atomic<bool> kManualCompactionCanceledFalse_{false}; + FakeCompaction* compaction_proxy_; +}; + +// It is possible that the output of the compaction iterator is empty even if +// the input is not. +TEST_P(CompactionIteratorTest, EmptyResult) { + InitIterators({test::KeyStr("a", 5, kTypeSingleDeletion), + test::KeyStr("a", 3, kTypeValue)}, + {"", "val"}, {}, {}, 5); + c_iter_->SeekToFirst(); + ASSERT_OK(c_iter_->status()); + ASSERT_FALSE(c_iter_->Valid()); +} + +// If there is a corruption after a single deletion, the corrupted key should +// be preserved. 
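+// (The extra bool argument to test::KeyStr marks the encoded key as corrupt,
+// which is how the corruption is injected here.)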
+TEST_P(CompactionIteratorTest, CorruptionAfterSingleDeletion) { + InitIterators({test::KeyStr("a", 5, kTypeSingleDeletion), + test::KeyStr("a", 3, kTypeValue, true), + test::KeyStr("b", 10, kTypeValue)}, + {"", "val", "val2"}, {}, {}, 10); + c_iter_->SeekToFirst(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ(test::KeyStr("a", 5, kTypeSingleDeletion), + c_iter_->key().ToString()); + c_iter_->Next(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ(test::KeyStr("a", 3, kTypeValue, true), c_iter_->key().ToString()); + c_iter_->Next(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ(test::KeyStr("b", 10, kTypeValue), c_iter_->key().ToString()); + c_iter_->Next(); + ASSERT_OK(c_iter_->status()); + ASSERT_FALSE(c_iter_->Valid()); +} + +TEST_P(CompactionIteratorTest, SimpleRangeDeletion) { + InitIterators({test::KeyStr("morning", 5, kTypeValue), + test::KeyStr("morning", 2, kTypeValue), + test::KeyStr("night", 3, kTypeValue)}, + {"zao", "zao", "wan"}, + {test::KeyStr("ma", 4, kTypeRangeDeletion)}, {"mz"}, 5); + c_iter_->SeekToFirst(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ(test::KeyStr("morning", 5, kTypeValue), c_iter_->key().ToString()); + c_iter_->Next(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ(test::KeyStr("night", 3, kTypeValue), c_iter_->key().ToString()); + c_iter_->Next(); + ASSERT_OK(c_iter_->status()); + ASSERT_FALSE(c_iter_->Valid()); +} + +TEST_P(CompactionIteratorTest, RangeDeletionWithSnapshots) { + AddSnapshot(10); + std::vector<std::string> ks1; + ks1.push_back(test::KeyStr("ma", 28, kTypeRangeDeletion)); + std::vector<std::string> vs1{"mz"}; + std::vector<std::string> ks2{test::KeyStr("morning", 15, kTypeValue), + test::KeyStr("morning", 5, kTypeValue), + test::KeyStr("night", 40, kTypeValue), + test::KeyStr("night", 20, kTypeValue)}; + std::vector<std::string> vs2{"zao 15", "zao 5", "wan 40", "wan 20"}; + InitIterators(ks2, vs2, ks1, vs1, 40); + c_iter_->SeekToFirst(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ(test::KeyStr("morning", 5, kTypeValue), c_iter_->key().ToString()); + c_iter_->Next(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ(test::KeyStr("night", 40, kTypeValue), c_iter_->key().ToString()); + c_iter_->Next(); + ASSERT_OK(c_iter_->status()); + ASSERT_FALSE(c_iter_->Valid()); +} + +TEST_P(CompactionIteratorTest, CompactionFilterSkipUntil) { + class Filter : public CompactionFilter { + Decision FilterV2(int /*level*/, const Slice& key, ValueType t, + const Slice& existing_value, std::string* /*new_value*/, + std::string* skip_until) const override { + std::string k = key.ToString(); + std::string v = existing_value.ToString(); + // See InitIterators() call below for the sequence of keys and their + // filtering decisions. Here we closely assert that compaction filter is + // called with the expected keys and only them, and with the right values. 
+ if (k == "a") { + EXPECT_EQ(ValueType::kValue, t); + EXPECT_EQ("av50", v); + return Decision::kKeep; + } + if (k == "b") { + EXPECT_EQ(ValueType::kValue, t); + EXPECT_EQ("bv60", v); + *skip_until = "d+"; + return Decision::kRemoveAndSkipUntil; + } + if (k == "e") { + EXPECT_EQ(ValueType::kMergeOperand, t); + EXPECT_EQ("em71", v); + return Decision::kKeep; + } + if (k == "f") { + if (v == "fm65") { + EXPECT_EQ(ValueType::kMergeOperand, t); + *skip_until = "f"; + } else { + EXPECT_EQ("fm30", v); + EXPECT_EQ(ValueType::kMergeOperand, t); + *skip_until = "g+"; + } + return Decision::kRemoveAndSkipUntil; + } + if (k == "h") { + EXPECT_EQ(ValueType::kValue, t); + EXPECT_EQ("hv91", v); + return Decision::kKeep; + } + if (k == "i") { + EXPECT_EQ(ValueType::kMergeOperand, t); + EXPECT_EQ("im95", v); + *skip_until = "z"; + return Decision::kRemoveAndSkipUntil; + } + ADD_FAILURE(); + return Decision::kKeep; + } + + const char* Name() const override { + return "CompactionIteratorTest.CompactionFilterSkipUntil::Filter"; + } + }; + + NoMergingMergeOp merge_op; + Filter filter; + InitIterators( + {test::KeyStr("a", 50, kTypeValue), // keep + test::KeyStr("a", 45, kTypeMerge), + test::KeyStr("b", 60, kTypeValue), // skip to "d+" + test::KeyStr("b", 40, kTypeValue), test::KeyStr("c", 35, kTypeValue), + test::KeyStr("d", 70, kTypeMerge), + test::KeyStr("e", 71, kTypeMerge), // keep + test::KeyStr("f", 65, kTypeMerge), // skip to "f", aka keep + test::KeyStr("f", 30, kTypeMerge), // skip to "g+" + test::KeyStr("f", 25, kTypeValue), test::KeyStr("g", 90, kTypeValue), + test::KeyStr("h", 91, kTypeValue), // keep + test::KeyStr("i", 95, kTypeMerge), // skip to "z" + test::KeyStr("j", 99, kTypeValue)}, + {"av50", "am45", "bv60", "bv40", "cv35", "dm70", "em71", "fm65", "fm30", + "fv25", "gv90", "hv91", "im95", "jv99"}, + {}, {}, kMaxSequenceNumber, kMaxSequenceNumber, &merge_op, &filter); + + // Compaction should output just "a", "e" and "h" keys. + c_iter_->SeekToFirst(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ(test::KeyStr("a", 50, kTypeValue), c_iter_->key().ToString()); + ASSERT_EQ("av50", c_iter_->value().ToString()); + c_iter_->Next(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ(test::KeyStr("e", 71, kTypeMerge), c_iter_->key().ToString()); + ASSERT_EQ("em71", c_iter_->value().ToString()); + c_iter_->Next(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ(test::KeyStr("h", 91, kTypeValue), c_iter_->key().ToString()); + ASSERT_EQ("hv91", c_iter_->value().ToString()); + c_iter_->Next(); + ASSERT_OK(c_iter_->status()); + ASSERT_FALSE(c_iter_->Valid()); + + // Check that the compaction iterator did the correct sequence of calls on + // the underlying iterator. 
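+  // A kRemoveAndSkipUntil decision makes the compaction iterator Seek() the
+  // input to the internal-key form of skip_until (the user key plus
+  // kMaxSequenceNumber and kValueTypeForSeek), which is exactly what the
+  // expected log below encodes.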
+ using A = LoggingForwardVectorIterator::Action; + using T = A::Type; + std::vector<A> expected_actions = { + A(T::SEEK_TO_FIRST), + A(T::NEXT), + A(T::NEXT), + A(T::SEEK, test::KeyStr("d+", kMaxSequenceNumber, kValueTypeForSeek)), + A(T::NEXT), + A(T::NEXT), + A(T::SEEK, test::KeyStr("g+", kMaxSequenceNumber, kValueTypeForSeek)), + A(T::NEXT), + A(T::SEEK, test::KeyStr("z", kMaxSequenceNumber, kValueTypeForSeek))}; + ASSERT_EQ(expected_actions, iter_->log); +} + +TEST_P(CompactionIteratorTest, ShuttingDownInFilter) { + NoMergingMergeOp merge_op; + StallingFilter filter; + InitIterators( + {test::KeyStr("1", 1, kTypeValue), test::KeyStr("2", 2, kTypeValue), + test::KeyStr("3", 3, kTypeValue), test::KeyStr("4", 4, kTypeValue)}, + {"v1", "v2", "v3", "v4"}, {}, {}, kMaxSequenceNumber, kMaxSequenceNumber, + &merge_op, &filter); + // Don't leave tombstones (kTypeDeletion) for filtered keys. + compaction_proxy_->key_not_exists_beyond_output_level = true; + + std::atomic<bool> seek_done{false}; + ROCKSDB_NAMESPACE::port::Thread compaction_thread([&] { + c_iter_->SeekToFirst(); + EXPECT_FALSE(c_iter_->Valid()); + EXPECT_TRUE(c_iter_->status().IsShutdownInProgress()); + seek_done.store(true); + }); + + // Let key 1 through. + filter.WaitForStall(1); + + // Shutdown during compaction filter call for key 2. + filter.WaitForStall(2); + shutting_down_.store(true); + EXPECT_FALSE(seek_done.load()); + + // Unstall filter and wait for SeekToFirst() to return. + filter.stall_at.store(3); + compaction_thread.join(); + assert(seek_done.load()); + + // Check that filter was never called again. + EXPECT_EQ(2, filter.last_seen.load()); +} + +// Same as ShuttingDownInFilter, but shutdown happens during filter call for +// a merge operand, not for a value. +TEST_P(CompactionIteratorTest, ShuttingDownInMerge) { + NoMergingMergeOp merge_op; + StallingFilter filter; + InitIterators( + {test::KeyStr("1", 1, kTypeValue), test::KeyStr("2", 2, kTypeMerge), + test::KeyStr("3", 3, kTypeMerge), test::KeyStr("4", 4, kTypeValue)}, + {"v1", "v2", "v3", "v4"}, {}, {}, kMaxSequenceNumber, kMaxSequenceNumber, + &merge_op, &filter); + compaction_proxy_->key_not_exists_beyond_output_level = true; + + std::atomic<bool> seek_done{false}; + ROCKSDB_NAMESPACE::port::Thread compaction_thread([&] { + c_iter_->SeekToFirst(); + ASSERT_FALSE(c_iter_->Valid()); + ASSERT_TRUE(c_iter_->status().IsShutdownInProgress()); + seek_done.store(true); + }); + + // Let key 1 through. + filter.WaitForStall(1); + + // Shutdown during compaction filter call for key 2. + filter.WaitForStall(2); + shutting_down_.store(true); + EXPECT_FALSE(seek_done.load()); + + // Unstall filter and wait for SeekToFirst() to return. + filter.stall_at.store(3); + compaction_thread.join(); + assert(seek_done.load()); + + // Check that filter was never called again. + EXPECT_EQ(2, filter.last_seen.load()); +} + +TEST_P(CompactionIteratorTest, SingleMergeOperand) { + class Filter : public CompactionFilter { + Decision FilterV2(int /*level*/, const Slice& key, ValueType t, + const Slice& existing_value, std::string* /*new_value*/, + std::string* /*skip_until*/) const override { + std::string k = key.ToString(); + std::string v = existing_value.ToString(); + + // See InitIterators() call below for the sequence of keys and their + // filtering decisions. Here we closely assert that compaction filter is + // called with the expected keys and only them, and with the right values. 
+ if (k == "a") { + EXPECT_EQ(ValueType::kMergeOperand, t); + EXPECT_EQ("av1", v); + return Decision::kKeep; + } else if (k == "b") { + EXPECT_EQ(ValueType::kMergeOperand, t); + return Decision::kKeep; + } else if (k == "c") { + return Decision::kKeep; + } + + ADD_FAILURE(); + return Decision::kKeep; + } + + const char* Name() const override { + return "CompactionIteratorTest.SingleMergeOperand::Filter"; + } + }; + + class SingleMergeOp : public MergeOperator { + public: + bool FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const override { + // See InitIterators() call below for why "c" is the only key for which + // FullMergeV2 should be called. + EXPECT_EQ("c", merge_in.key.ToString()); + + std::string temp_value; + if (merge_in.existing_value != nullptr) { + temp_value = merge_in.existing_value->ToString(); + } + + for (auto& operand : merge_in.operand_list) { + temp_value.append(operand.ToString()); + } + merge_out->new_value = temp_value; + + return true; + } + + bool PartialMergeMulti(const Slice& key, + const std::deque<Slice>& operand_list, + std::string* new_value, + Logger* /*logger*/) const override { + std::string string_key = key.ToString(); + EXPECT_TRUE(string_key == "a" || string_key == "b"); + + if (string_key == "a") { + EXPECT_EQ(1, operand_list.size()); + } else if (string_key == "b") { + EXPECT_EQ(2, operand_list.size()); + } + + std::string temp_value; + for (auto& operand : operand_list) { + temp_value.append(operand.ToString()); + } + swap(temp_value, *new_value); + + return true; + } + + const char* Name() const override { + return "CompactionIteratorTest SingleMergeOp"; + } + + bool AllowSingleOperand() const override { return true; } + }; + + SingleMergeOp merge_op; + Filter filter; + InitIterators( + // a should invoke PartialMergeMulti with a single merge operand. + {test::KeyStr("a", 50, kTypeMerge), + // b should invoke PartialMergeMulti with two operands. + test::KeyStr("b", 70, kTypeMerge), test::KeyStr("b", 60, kTypeMerge), + // c should invoke FullMerge due to kTypeValue at the beginning. + test::KeyStr("c", 90, kTypeMerge), test::KeyStr("c", 80, kTypeValue)}, + {"av1", "bv2", "bv1", "cv2", "cv1"}, {}, {}, kMaxSequenceNumber, + kMaxSequenceNumber, &merge_op, &filter); + + c_iter_->SeekToFirst(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ(test::KeyStr("a", 50, kTypeMerge), c_iter_->key().ToString()); + ASSERT_EQ("av1", c_iter_->value().ToString()); + c_iter_->Next(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ("bv1bv2", c_iter_->value().ToString()); + c_iter_->Next(); + ASSERT_OK(c_iter_->status()); + ASSERT_EQ("cv1cv2", c_iter_->value().ToString()); +} + +// In bottommost level, values earlier than earliest snapshot can be output +// with sequence = 0. +TEST_P(CompactionIteratorTest, ZeroOutSequenceAtBottomLevel) { + AddSnapshot(1); + RunTest({test::KeyStr("a", 1, kTypeValue), test::KeyStr("b", 2, kTypeValue)}, + {"v1", "v2"}, + {test::KeyStr("a", 0, kTypeValue), test::KeyStr("b", 2, kTypeValue)}, + {"v1", "v2"}, kMaxSequenceNumber /*last_committed_seq*/, + nullptr /*merge_operator*/, nullptr /*compaction_filter*/, + true /*bottommost_level*/); +} + +// In bottommost level, deletions earlier than earliest snapshot can be removed +// permanently. 
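+// In the test below, the deletion of "a" at seq 1 is visible to the earliest
+// snapshot (1), so at the bottommost level it can be dropped outright. The
+// deletion of "b" at seq 3 is newer than that snapshot and must be kept, and
+// the value of "b" at seq 1 survives (with its seqno zeroed) because
+// snapshot 1 can still read it.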
+TEST_P(CompactionIteratorTest, RemoveDeletionAtBottomLevel) { + AddSnapshot(1); + RunTest( + {test::KeyStr("a", 1, kTypeDeletion), test::KeyStr("b", 3, kTypeDeletion), + test::KeyStr("b", 1, kTypeValue)}, + {"", "", ""}, + {test::KeyStr("b", 3, kTypeDeletion), test::KeyStr("b", 0, kTypeValue)}, + {"", ""}, kMaxSequenceNumber /*last_committed_seq*/, + nullptr /*merge_operator*/, nullptr /*compaction_filter*/, + true /*bottommost_level*/); +} + +// In bottommost level, single deletions earlier than earliest snapshot can be +// removed permanently. +TEST_P(CompactionIteratorTest, RemoveSingleDeletionAtBottomLevel) { + AddSnapshot(1); + RunTest({test::KeyStr("a", 1, kTypeSingleDeletion), + test::KeyStr("b", 2, kTypeSingleDeletion)}, + {"", ""}, {test::KeyStr("b", 2, kTypeSingleDeletion)}, {""}, + kMaxSequenceNumber /*last_committed_seq*/, nullptr /*merge_operator*/, + nullptr /*compaction_filter*/, true /*bottommost_level*/); +} + +TEST_P(CompactionIteratorTest, ConvertToPutAtBottom) { + std::shared_ptr<MergeOperator> merge_op = + MergeOperators::CreateStringAppendOperator(); + RunTest({test::KeyStr("a", 4, kTypeMerge), test::KeyStr("a", 3, kTypeMerge), + test::KeyStr("a", 2, kTypeMerge), test::KeyStr("b", 1, kTypeValue)}, + {"a4", "a3", "a2", "b1"}, + {test::KeyStr("a", 0, kTypeValue), test::KeyStr("b", 0, kTypeValue)}, + {"a2,a3,a4", "b1"}, kMaxSequenceNumber /*last_committed_seq*/, + merge_op.get(), nullptr /*compaction_filter*/, + true /*bottomost_level*/); +} + +INSTANTIATE_TEST_CASE_P(CompactionIteratorTestInstance, CompactionIteratorTest, + testing::Values(true, false)); + +class PerKeyPlacementCompIteratorTest : public CompactionIteratorTest { + public: + bool SupportsPerKeyPlacement() const override { return true; } +}; + +TEST_P(PerKeyPlacementCompIteratorTest, SplitLastLevelData) { + std::atomic_uint64_t latest_cold_seq = 0; + + SyncPoint::GetInstance()->SetCallBack( + "CompactionIterator::PrepareOutput.context", [&](void* arg) { + auto context = static_cast<PerKeyPlacementContext*>(arg); + context->output_to_penultimate_level = + context->seq_num > latest_cold_seq; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + latest_cold_seq = 5; + + InitIterators( + {test::KeyStr("a", 7, kTypeValue), test::KeyStr("b", 6, kTypeValue), + test::KeyStr("c", 5, kTypeValue)}, + {"vala", "valb", "valc"}, {}, {}, kMaxSequenceNumber, kMaxSequenceNumber, + nullptr, nullptr, true); + c_iter_->SeekToFirst(); + ASSERT_TRUE(c_iter_->Valid()); + + // the first 2 keys are hot, which should has + // `output_to_penultimate_level()==true` and seq num not zeroed out + ASSERT_EQ(test::KeyStr("a", 7, kTypeValue), c_iter_->key().ToString()); + ASSERT_TRUE(c_iter_->output_to_penultimate_level()); + c_iter_->Next(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ(test::KeyStr("b", 6, kTypeValue), c_iter_->key().ToString()); + ASSERT_TRUE(c_iter_->output_to_penultimate_level()); + c_iter_->Next(); + ASSERT_TRUE(c_iter_->Valid()); + // `a` is cold data, which should be output to bottommost + ASSERT_EQ(test::KeyStr("c", 0, kTypeValue), c_iter_->key().ToString()); + ASSERT_FALSE(c_iter_->output_to_penultimate_level()); + c_iter_->Next(); + ASSERT_OK(c_iter_->status()); + ASSERT_FALSE(c_iter_->Valid()); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_P(PerKeyPlacementCompIteratorTest, SnapshotData) { + AddSnapshot(5); + + InitIterators( + {test::KeyStr("a", 7, kTypeValue), test::KeyStr("b", 6, kTypeDeletion), + test::KeyStr("b", 5, kTypeValue)}, + {"vala", 
"", "valb"}, {}, {}, kMaxSequenceNumber, kMaxSequenceNumber, + nullptr, nullptr, true); + c_iter_->SeekToFirst(); + ASSERT_TRUE(c_iter_->Valid()); + + // The first key and the tombstone are within snapshot, which should output + // to the penultimate level (and seq num cannot be zeroed out). + ASSERT_EQ(test::KeyStr("a", 7, kTypeValue), c_iter_->key().ToString()); + ASSERT_TRUE(c_iter_->output_to_penultimate_level()); + c_iter_->Next(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ(test::KeyStr("b", 6, kTypeDeletion), c_iter_->key().ToString()); + ASSERT_TRUE(c_iter_->output_to_penultimate_level()); + c_iter_->Next(); + ASSERT_TRUE(c_iter_->Valid()); + // `a` is not protected by the snapshot, the sequence number is zero out and + // should output bottommost + ASSERT_EQ(test::KeyStr("b", 0, kTypeValue), c_iter_->key().ToString()); + ASSERT_FALSE(c_iter_->output_to_penultimate_level()); + c_iter_->Next(); + ASSERT_OK(c_iter_->status()); + ASSERT_FALSE(c_iter_->Valid()); +} + +TEST_P(PerKeyPlacementCompIteratorTest, ConflictWithSnapshot) { + std::atomic_uint64_t latest_cold_seq = 0; + + SyncPoint::GetInstance()->SetCallBack( + "CompactionIterator::PrepareOutput.context", [&](void* arg) { + auto context = static_cast<PerKeyPlacementContext*>(arg); + context->output_to_penultimate_level = + context->seq_num > latest_cold_seq; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + latest_cold_seq = 6; + + AddSnapshot(5); + + InitIterators({test::KeyStr("a", 7, kTypeValue), + test::KeyStr("unsafe_pb", 6, kTypeValue), + test::KeyStr("c", 5, kTypeValue)}, + {"vala", "valb", "valc"}, {}, {}, kMaxSequenceNumber, + kMaxSequenceNumber, nullptr, nullptr, true); + c_iter_->SeekToFirst(); + ASSERT_TRUE(c_iter_->Valid()); + + ASSERT_EQ(test::KeyStr("a", 7, kTypeValue), c_iter_->key().ToString()); + ASSERT_TRUE(c_iter_->output_to_penultimate_level()); + // the 2nd key is unsafe to output_to_penultimate_level, but it's within + // snapshot so for per_key_placement feature it has to be outputted to the + // penultimate level. which is a corruption. We should never see + // such case as the data with seq num (within snapshot) should always come + // from higher compaction input level, which makes it safe to + // output_to_penultimate_level. + c_iter_->Next(); + ASSERT_TRUE(c_iter_->status().IsCorruption()); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +INSTANTIATE_TEST_CASE_P(PerKeyPlacementCompIteratorTest, + PerKeyPlacementCompIteratorTest, + testing::Values(true, false)); + +// Tests how CompactionIterator work together with SnapshotChecker. +class CompactionIteratorWithSnapshotCheckerTest + : public CompactionIteratorTest { + public: + bool UseSnapshotChecker() const override { return true; } +}; + +// Uncommitted keys (keys with seq > last_committed_seq) should be output as-is +// while committed version of these keys should get compacted as usual. 
+ +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + PreserveUncommittedKeys_Value) { + RunTest( + {test::KeyStr("foo", 3, kTypeValue), test::KeyStr("foo", 2, kTypeValue), + test::KeyStr("foo", 1, kTypeValue)}, + {"v3", "v2", "v1"}, + {test::KeyStr("foo", 3, kTypeValue), test::KeyStr("foo", 2, kTypeValue)}, + {"v3", "v2"}, 2 /*last_committed_seq*/); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + PreserveUncommittedKeys_Deletion) { + RunTest({test::KeyStr("foo", 2, kTypeDeletion), + test::KeyStr("foo", 1, kTypeValue)}, + {"", "v1"}, + {test::KeyStr("foo", 2, kTypeDeletion), + test::KeyStr("foo", 1, kTypeValue)}, + {"", "v1"}, 1 /*last_committed_seq*/); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + PreserveUncommittedKeys_Merge) { + auto merge_op = MergeOperators::CreateStringAppendOperator(); + RunTest( + {test::KeyStr("foo", 3, kTypeMerge), test::KeyStr("foo", 2, kTypeMerge), + test::KeyStr("foo", 1, kTypeValue)}, + {"v3", "v2", "v1"}, + {test::KeyStr("foo", 3, kTypeMerge), test::KeyStr("foo", 2, kTypeValue)}, + {"v3", "v1,v2"}, 2 /*last_committed_seq*/, merge_op.get()); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + PreserveUncommittedKeys_SingleDelete) { + RunTest({test::KeyStr("foo", 2, kTypeSingleDeletion), + test::KeyStr("foo", 1, kTypeValue)}, + {"", "v1"}, + {test::KeyStr("foo", 2, kTypeSingleDeletion), + test::KeyStr("foo", 1, kTypeValue)}, + {"", "v1"}, 1 /*last_committed_seq*/); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + PreserveUncommittedKeys_BlobIndex) { + RunTest({test::KeyStr("foo", 3, kTypeBlobIndex), + test::KeyStr("foo", 2, kTypeBlobIndex), + test::KeyStr("foo", 1, kTypeBlobIndex)}, + {"v3", "v2", "v1"}, + {test::KeyStr("foo", 3, kTypeBlobIndex), + test::KeyStr("foo", 2, kTypeBlobIndex)}, + {"v3", "v2"}, 2 /*last_committed_seq*/); +} + +// Test compaction iterator dedup keys visible to the same snapshot. 
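+// Two versions of a key collapse to the newer one only when their earliest
+// visible snapshots coincide. In DedupSameSnapshot_Value below, snapshot 2
+// only sees seqnos <= 1, so foo@3 and foo@2 share the same earliest visible
+// snapshot and foo@2 is dropped, while foo@1 is kept for snapshot 2.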
+ +TEST_F(CompactionIteratorWithSnapshotCheckerTest, DedupSameSnapshot_Value) { + AddSnapshot(2, 1); + RunTest( + {test::KeyStr("foo", 4, kTypeValue), test::KeyStr("foo", 3, kTypeValue), + test::KeyStr("foo", 2, kTypeValue), test::KeyStr("foo", 1, kTypeValue)}, + {"v4", "v3", "v2", "v1"}, + {test::KeyStr("foo", 4, kTypeValue), test::KeyStr("foo", 3, kTypeValue), + test::KeyStr("foo", 1, kTypeValue)}, + {"v4", "v3", "v1"}, 3 /*last_committed_seq*/); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, DedupSameSnapshot_Deletion) { + AddSnapshot(2, 1); + RunTest( + {test::KeyStr("foo", 4, kTypeValue), + test::KeyStr("foo", 3, kTypeDeletion), + test::KeyStr("foo", 2, kTypeValue), test::KeyStr("foo", 1, kTypeValue)}, + {"v4", "", "v2", "v1"}, + {test::KeyStr("foo", 4, kTypeValue), + test::KeyStr("foo", 3, kTypeDeletion), + test::KeyStr("foo", 1, kTypeValue)}, + {"v4", "", "v1"}, 3 /*last_committed_seq*/); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, DedupSameSnapshot_Merge) { + AddSnapshot(2, 1); + AddSnapshot(4, 3); + auto merge_op = MergeOperators::CreateStringAppendOperator(); + RunTest( + {test::KeyStr("foo", 5, kTypeMerge), test::KeyStr("foo", 4, kTypeMerge), + test::KeyStr("foo", 3, kTypeMerge), test::KeyStr("foo", 2, kTypeMerge), + test::KeyStr("foo", 1, kTypeValue)}, + {"v5", "v4", "v3", "v2", "v1"}, + {test::KeyStr("foo", 5, kTypeMerge), test::KeyStr("foo", 4, kTypeMerge), + test::KeyStr("foo", 3, kTypeMerge), test::KeyStr("foo", 1, kTypeValue)}, + {"v5", "v4", "v2,v3", "v1"}, 4 /*last_committed_seq*/, merge_op.get()); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + DedupSameSnapshot_SingleDeletion) { + AddSnapshot(2, 1); + RunTest( + {test::KeyStr("foo", 4, kTypeValue), + test::KeyStr("foo", 3, kTypeSingleDeletion), + test::KeyStr("foo", 2, kTypeValue), test::KeyStr("foo", 1, kTypeValue)}, + {"v4", "", "v2", "v1"}, + {test::KeyStr("foo", 4, kTypeValue), test::KeyStr("foo", 1, kTypeValue)}, + {"v4", "v1"}, 3 /*last_committed_seq*/); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, DedupSameSnapshot_BlobIndex) { + AddSnapshot(2, 1); + RunTest({test::KeyStr("foo", 4, kTypeBlobIndex), + test::KeyStr("foo", 3, kTypeBlobIndex), + test::KeyStr("foo", 2, kTypeBlobIndex), + test::KeyStr("foo", 1, kTypeBlobIndex)}, + {"v4", "v3", "v2", "v1"}, + {test::KeyStr("foo", 4, kTypeBlobIndex), + test::KeyStr("foo", 3, kTypeBlobIndex), + test::KeyStr("foo", 1, kTypeBlobIndex)}, + {"v4", "v3", "v1"}, 3 /*last_committed_seq*/); +} + +// At bottom level, sequence numbers can be zero out, and deletions can be +// removed, but only when they are visible to earliest snapshot. 
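+// ("Visible to earliest snapshot" here means DefinitelyInSnapshot() holds:
+// with a snapshot checker present, seq <= snapshot alone is not sufficient.)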
+ +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + NotZeroOutSequenceIfNotVisibleToEarliestSnapshot) { + AddSnapshot(2, 1); + RunTest({test::KeyStr("a", 1, kTypeValue), test::KeyStr("b", 2, kTypeValue), + test::KeyStr("c", 3, kTypeValue)}, + {"v1", "v2", "v3"}, + {test::KeyStr("a", 0, kTypeValue), test::KeyStr("b", 2, kTypeValue), + test::KeyStr("c", 3, kTypeValue)}, + {"v1", "v2", "v3"}, kMaxSequenceNumber /*last_committed_seq*/, + nullptr /*merge_operator*/, nullptr /*compaction_filter*/, + true /*bottommost_level*/); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + NotRemoveDeletionIfNotVisibleToEarliestSnapshot) { + AddSnapshot(2, 1); + RunTest( + {test::KeyStr("a", 1, kTypeDeletion), test::KeyStr("b", 2, kTypeDeletion), + test::KeyStr("c", 3, kTypeDeletion)}, + {"", "", ""}, {}, {"", ""}, kMaxSequenceNumber /*last_committed_seq*/, + nullptr /*merge_operator*/, nullptr /*compaction_filter*/, + true /*bottommost_level*/); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + NotRemoveDeletionIfValuePresentToEarlierSnapshot) { + AddSnapshot(2, 1); + RunTest({test::KeyStr("a", 4, kTypeDeletion), + test::KeyStr("a", 1, kTypeValue), test::KeyStr("b", 3, kTypeValue)}, + {"", "", ""}, + {test::KeyStr("a", 4, kTypeDeletion), + test::KeyStr("a", 0, kTypeValue), test::KeyStr("b", 3, kTypeValue)}, + {"", "", ""}, kMaxSequenceNumber /*last_committed_seq*/, + nullptr /*merge_operator*/, nullptr /*compaction_filter*/, + true /*bottommost_level*/); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + NotRemoveSingleDeletionIfNotVisibleToEarliestSnapshot) { + AddSnapshot(2, 1); + RunTest({test::KeyStr("a", 1, kTypeSingleDeletion), + test::KeyStr("b", 2, kTypeSingleDeletion), + test::KeyStr("c", 3, kTypeSingleDeletion)}, + {"", "", ""}, + {test::KeyStr("b", 2, kTypeSingleDeletion), + test::KeyStr("c", 3, kTypeSingleDeletion)}, + {"", ""}, kMaxSequenceNumber /*last_committed_seq*/, + nullptr /*merge_operator*/, nullptr /*compaction_filter*/, + true /*bottommost_level*/); +} + +// Single delete should not cancel out values that not visible to the +// same set of snapshots +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + SingleDeleteAcrossSnapshotBoundary) { + AddSnapshot(2, 1); + RunTest({test::KeyStr("a", 2, kTypeSingleDeletion), + test::KeyStr("a", 1, kTypeValue)}, + {"", "v1"}, + {test::KeyStr("a", 2, kTypeSingleDeletion), + test::KeyStr("a", 1, kTypeValue)}, + {"", "v1"}, 2 /*last_committed_seq*/); +} + +// Single delete should be kept in case it is not visible to the +// earliest write conflict snapshot. If a single delete is kept for this reason, +// corresponding value can be trimmed to save space. +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + KeepSingleDeletionForWriteConflictChecking) { + AddSnapshot(2, 0); + RunTest({test::KeyStr("a", 2, kTypeSingleDeletion), + test::KeyStr("a", 1, kTypeValue)}, + {"", "v1"}, + {test::KeyStr("a", 2, kTypeSingleDeletion), + test::KeyStr("a", 1, kTypeValue)}, + {"", ""}, 2 /*last_committed_seq*/, nullptr /*merge_operator*/, + nullptr /*compaction_filter*/, false /*bottommost_level*/, + 2 /*earliest_write_conflict_snapshot*/); +} + +// Same as above but with a blob index. In addition to the value getting +// trimmed, the type of the KV is changed to kTypeValue. 
+TEST_F(CompactionIteratorWithSnapshotCheckerTest, + KeepSingleDeletionForWriteConflictChecking_BlobIndex) { + AddSnapshot(2, 0); + RunTest({test::KeyStr("a", 2, kTypeSingleDeletion), + test::KeyStr("a", 1, kTypeBlobIndex)}, + {"", "fake_blob_index"}, + {test::KeyStr("a", 2, kTypeSingleDeletion), + test::KeyStr("a", 1, kTypeValue)}, + {"", ""}, 2 /*last_committed_seq*/, nullptr /*merge_operator*/, + nullptr /*compaction_filter*/, false /*bottommost_level*/, + 2 /*earliest_write_conflict_snapshot*/); +} + +// Same as above but with a wide-column entity. In addition to the value getting +// trimmed, the type of the KV is changed to kTypeValue. +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + KeepSingleDeletionForWriteConflictChecking_WideColumnEntity) { + AddSnapshot(2, 0); + RunTest({test::KeyStr("a", 2, kTypeSingleDeletion), + test::KeyStr("a", 1, kTypeWideColumnEntity)}, + {"", "fake_entity"}, + {test::KeyStr("a", 2, kTypeSingleDeletion), + test::KeyStr("a", 1, kTypeValue)}, + {"", ""}, 2 /* last_committed_seq */, nullptr /* merge_operator */, + nullptr /* compaction_filter */, false /* bottommost_level */, + 2 /* earliest_write_conflict_snapshot */); +} + +// Compaction filter should keep uncommitted key as-is, and +// * Convert the latest value to deletion, and/or +// * if latest value is a merge, apply filter to all subsequent merges. + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, CompactionFilter_Value) { + std::unique_ptr<CompactionFilter> compaction_filter( + new FilterAllKeysCompactionFilter()); + RunTest( + {test::KeyStr("a", 2, kTypeValue), test::KeyStr("a", 1, kTypeValue), + test::KeyStr("b", 3, kTypeValue), test::KeyStr("c", 1, kTypeValue)}, + {"v2", "v1", "v3", "v4"}, + {test::KeyStr("a", 2, kTypeValue), test::KeyStr("a", 1, kTypeDeletion), + test::KeyStr("b", 3, kTypeValue), test::KeyStr("c", 1, kTypeDeletion)}, + {"v2", "", "v3", ""}, 1 /*last_committed_seq*/, + nullptr /*merge_operator*/, compaction_filter.get()); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, CompactionFilter_Deletion) { + std::unique_ptr<CompactionFilter> compaction_filter( + new FilterAllKeysCompactionFilter()); + RunTest( + {test::KeyStr("a", 2, kTypeDeletion), test::KeyStr("a", 1, kTypeValue)}, + {"", "v1"}, + {test::KeyStr("a", 2, kTypeDeletion), + test::KeyStr("a", 1, kTypeDeletion)}, + {"", ""}, 1 /*last_committed_seq*/, nullptr /*merge_operator*/, + compaction_filter.get()); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + CompactionFilter_PartialMerge) { + std::shared_ptr<MergeOperator> merge_op = + MergeOperators::CreateStringAppendOperator(); + std::unique_ptr<CompactionFilter> compaction_filter( + new FilterAllKeysCompactionFilter()); + RunTest({test::KeyStr("a", 3, kTypeMerge), test::KeyStr("a", 2, kTypeMerge), + test::KeyStr("a", 1, kTypeMerge)}, + {"v3", "v2", "v1"}, {test::KeyStr("a", 3, kTypeMerge)}, {"v3"}, + 2 /*last_committed_seq*/, merge_op.get(), compaction_filter.get()); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, CompactionFilter_FullMerge) { + std::shared_ptr<MergeOperator> merge_op = + MergeOperators::CreateStringAppendOperator(); + std::unique_ptr<CompactionFilter> compaction_filter( + new FilterAllKeysCompactionFilter()); + RunTest( + {test::KeyStr("a", 3, kTypeMerge), test::KeyStr("a", 2, kTypeMerge), + test::KeyStr("a", 1, kTypeValue)}, + {"v3", "v2", "v1"}, + {test::KeyStr("a", 3, kTypeMerge), test::KeyStr("a", 1, kTypeDeletion)}, + {"v3", ""}, 2 /*last_committed_seq*/, merge_op.get(), + compaction_filter.get()); +} + +// Tests how 
CompactionIterator works together with AllowIngestBehind.
+class CompactionIteratorWithAllowIngestBehindTest
+    : public CompactionIteratorTest {
+ public:
+  bool AllowIngestBehind() const override { return true; }
+};
+
+// When allow_ingest_behind is set, compaction iterator is not targeting
+// the bottommost level since there is no guarantee there won't be further
+// data ingested under the compaction output in future.
+TEST_P(CompactionIteratorWithAllowIngestBehindTest, NoConvertToPutAtBottom) {
+  std::shared_ptr<MergeOperator> merge_op =
+      MergeOperators::CreateStringAppendOperator();
+  RunTest({test::KeyStr("a", 4, kTypeMerge), test::KeyStr("a", 3, kTypeMerge),
+           test::KeyStr("a", 2, kTypeMerge), test::KeyStr("b", 1, kTypeValue)},
+          {"a4", "a3", "a2", "b1"},
+          {test::KeyStr("a", 4, kTypeMerge), test::KeyStr("b", 1, kTypeValue)},
+          {"a2,a3,a4", "b1"}, kMaxSequenceNumber /*last_committed_seq*/,
+          merge_op.get(), nullptr /*compaction_filter*/,
+          true /*bottommost_level*/);
+}
+
+TEST_P(CompactionIteratorWithAllowIngestBehindTest,
+       MergeToPutIfEncounteredPutAtBottom) {
+  std::shared_ptr<MergeOperator> merge_op =
+      MergeOperators::CreateStringAppendOperator();
+  RunTest({test::KeyStr("a", 4, kTypeMerge), test::KeyStr("a", 3, kTypeMerge),
+           test::KeyStr("a", 2, kTypeValue), test::KeyStr("b", 1, kTypeValue)},
+          {"a4", "a3", "a2", "b1"},
+          {test::KeyStr("a", 4, kTypeValue), test::KeyStr("b", 1, kTypeValue)},
+          {"a2,a3,a4", "b1"}, kMaxSequenceNumber /*last_committed_seq*/,
+          merge_op.get(), nullptr /*compaction_filter*/,
+          true /*bottommost_level*/);
+}
+
+INSTANTIATE_TEST_CASE_P(CompactionIteratorWithAllowIngestBehindTestInstance,
+                        CompactionIteratorWithAllowIngestBehindTest,
+                        testing::Values(true, false));
+
+class CompactionIteratorTsGcTest : public CompactionIteratorTest {
+ public:
+  CompactionIteratorTsGcTest()
+      : CompactionIteratorTest(test::BytewiseComparatorWithU64TsWrapper()) {}
+};
+
+TEST_P(CompactionIteratorTsGcTest, NoKeyEligibleForGC) {
+  constexpr char user_key[][2] = {{'a', '\0'}, {'b', '\0'}};
+  const std::vector<std::string> input_keys = {
+      test::KeyStr(/*ts=*/103, user_key[0], /*seq=*/4, kTypeValue),
+      test::KeyStr(/*ts=*/102, user_key[0], /*seq=*/3,
+                   kTypeDeletionWithTimestamp),
+      test::KeyStr(/*ts=*/104, user_key[1], /*seq=*/5, kTypeValue)};
+  const std::vector<std::string> input_values = {"a3", "", "b2"};
+  std::string full_history_ts_low;
+  // All keys' timestamps are newer than or equal to 102, thus none of them
+  // will be eligible for GC.
+ PutFixed64(&full_history_ts_low, 102); + const std::vector<std::string>& expected_keys = input_keys; + const std::vector<std::string>& expected_values = input_values; + const std::vector<std::pair<bool, bool>> params = { + {false, false}, {false, true}, {true, true}}; + for (const std::pair<bool, bool>& param : params) { + const bool bottommost_level = param.first; + const bool key_not_exists_beyond_output_level = param.second; + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_seq=*/kMaxSequenceNumber, + /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr, + bottommost_level, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + key_not_exists_beyond_output_level, &full_history_ts_low); + } +} + +TEST_P(CompactionIteratorTsGcTest, NoMergeEligibleForGc) { + constexpr char user_key[] = "a"; + const std::vector<std::string> input_keys = { + test::KeyStr(10002, user_key, 102, kTypeMerge), + test::KeyStr(10001, user_key, 101, kTypeMerge), + test::KeyStr(10000, user_key, 100, kTypeValue)}; + const std::vector<std::string> input_values = {"2", "1", "a0"}; + std::shared_ptr<MergeOperator> merge_op = + MergeOperators::CreateStringAppendTESTOperator(); + const std::vector<std::string>& expected_keys = input_keys; + const std::vector<std::string>& expected_values = input_values; + const std::vector<std::pair<bool, bool>> params = { + {false, false}, {false, true}, {true, true}}; + for (const auto& param : params) { + const bool bottommost_level = param.first; + const bool key_not_exists_beyond_output_level = param.second; + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_seq=*/kMaxSequenceNumber, merge_op.get(), + /*compaction_filter=*/nullptr, bottommost_level, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + key_not_exists_beyond_output_level, + /*full_history_ts_low=*/nullptr); + } +} + +TEST_P(CompactionIteratorTsGcTest, AllKeysOlderThanThreshold) { + constexpr char user_key[][2] = {{'a', '\0'}, {'b', '\0'}}; + const std::vector<std::string> input_keys = { + test::KeyStr(/*ts=*/103, user_key[0], /*seq=*/4, + kTypeDeletionWithTimestamp), + test::KeyStr(/*ts=*/102, user_key[0], /*seq=*/3, kTypeValue), + test::KeyStr(/*ts=*/101, user_key[0], /*seq=*/2, kTypeValue), + test::KeyStr(/*ts=*/104, user_key[1], /*seq=*/5, kTypeValue)}; + const std::vector<std::string> input_values = {"", "a2", "a1", "b5"}; + std::string full_history_ts_low; + PutFixed64(&full_history_ts_low, std::numeric_limits<uint64_t>::max()); + { + // With a snapshot at seq 3, both the deletion marker and the key at 3 must + // be preserved. + AddSnapshot(3); + const std::vector<std::string> expected_keys = { + input_keys[0], input_keys[1], input_keys[3]}; + const std::vector<std::string> expected_values = {"", "a2", "b5"}; + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_seq=*/kMaxSequenceNumber, + /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr, + /*bottommost_level=*/false, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + /*key_not_exists_beyond_output_level=*/false, &full_history_ts_low); + ClearSnapshots(); + } + { + // No snapshot, the deletion marker should be preserved because the user + // key may appear beyond output level. 
+ const std::vector<std::string> expected_keys = {input_keys[0], + input_keys[3]}; + const std::vector<std::string> expected_values = {"", "b5"}; + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_seq=*/kMaxSequenceNumber, + /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr, + /*bottommost_level=*/false, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + /*key_not_exists_beyond_output_level=*/false, &full_history_ts_low); + } + { + // No snapshot, the deletion marker can be dropped because the user key + // does not appear in higher levels. + const std::vector<std::string> expected_keys = {input_keys[3]}; + const std::vector<std::string> expected_values = {"b5"}; + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_seq=*/kMaxSequenceNumber, + /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr, + /*bottommost_level=*/false, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + /*key_not_exists_beyond_output_level=*/true, &full_history_ts_low); + } +} + +TEST_P(CompactionIteratorTsGcTest, SomeMergesOlderThanThreshold) { + constexpr char user_key[][2] = {"a", "f"}; + const std::vector<std::string> input_keys = { + test::KeyStr(/*ts=*/25000, user_key[0], /*seq=*/2500, kTypeMerge), + test::KeyStr(/*ts=*/19000, user_key[0], /*seq=*/2300, kTypeMerge), + test::KeyStr(/*ts=*/18000, user_key[0], /*seq=*/1800, kTypeMerge), + test::KeyStr(/*ts=*/16000, user_key[0], /*seq=*/1600, kTypeValue), + test::KeyStr(/*ts=*/19000, user_key[1], /*seq=*/2000, kTypeMerge), + test::KeyStr(/*ts=*/17000, user_key[1], /*seq=*/1700, kTypeMerge), + test::KeyStr(/*ts=*/15000, user_key[1], /*seq=*/1600, + kTypeDeletionWithTimestamp)}; + const std::vector<std::string> input_values = {"25", "19", "18", "16", + "19", "17", ""}; + std::shared_ptr<MergeOperator> merge_op = + MergeOperators::CreateStringAppendTESTOperator(); + std::string full_history_ts_low; + PutFixed64(&full_history_ts_low, 20000); + + const std::vector<std::pair<bool, bool>> params = { + {false, false}, {false, true}, {true, true}}; + + { + AddSnapshot(1600); + AddSnapshot(1900); + const std::vector<std::string> expected_keys = { + test::KeyStr(/*ts=*/25000, user_key[0], /*seq=*/2500, kTypeMerge), + test::KeyStr(/*ts=*/19000, user_key[0], /*seq=*/2300, kTypeMerge), + test::KeyStr(/*ts=*/18000, user_key[0], /*seq=*/1800, kTypeMerge), + test::KeyStr(/*ts=*/16000, user_key[0], /*seq=*/1600, kTypeValue), + test::KeyStr(/*ts=*/19000, user_key[1], /*seq=*/2000, kTypeMerge), + test::KeyStr(/*ts=*/17000, user_key[1], /*seq=*/1700, kTypeMerge), + test::KeyStr(/*ts=*/15000, user_key[1], /*seq=*/1600, + kTypeDeletionWithTimestamp)}; + const std::vector<std::string> expected_values = {"25", "19", "18", "16", + "19", "17", ""}; + for (const auto& param : params) { + const bool bottommost_level = param.first; + const bool key_not_exists_beyond_output_level = param.second; + auto expected_keys_copy = expected_keys; + auto expected_values_copy = expected_values; + if (bottommost_level || key_not_exists_beyond_output_level) { + // the kTypeDeletionWithTimestamp will be dropped + expected_keys_copy.pop_back(); + expected_values_copy.pop_back(); + if (bottommost_level) { + // seq zero + expected_keys_copy[3] = + test::KeyStr(/*ts=*/0, user_key[0], /*seq=*/0, kTypeValue); + } + } + RunTest(input_keys, input_values, expected_keys_copy, + expected_values_copy, + /*last_committed_seq=*/kMaxSequenceNumber, merge_op.get(), + /*compaction_filter=*/nullptr, bottommost_level, + 
/*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + key_not_exists_beyond_output_level, &full_history_ts_low); + } + ClearSnapshots(); + } + + // No snapshots + { + const std::vector<std::string> expected_keys = { + test::KeyStr(/*ts=*/25000, user_key[0], /*seq=*/2500, kTypeValue), + test::KeyStr(/*ts=*/19000, user_key[1], /*seq=*/2000, kTypeValue)}; + const std::vector<std::string> expected_values = {"16,18,19,25", "17,19"}; + for (const auto& param : params) { + const bool bottommost_level = param.first; + const bool key_not_exists_beyond_output_level = param.second; + auto expected_keys_copy = expected_keys; + auto expected_values_copy = expected_values; + if (bottommost_level) { + expected_keys_copy[1] = + test::KeyStr(/*ts=*/0, user_key[1], /*seq=*/0, kTypeValue); + } + RunTest(input_keys, input_values, expected_keys_copy, + expected_values_copy, + /*last_committed_seq=*/kMaxSequenceNumber, merge_op.get(), + /*compaction_filter=*/nullptr, bottommost_level, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + key_not_exists_beyond_output_level, &full_history_ts_low); + } + } +} + +TEST_P(CompactionIteratorTsGcTest, NewHidesOldSameSnapshot) { + constexpr char user_key[] = "a"; + const std::vector<std::string> input_keys = { + test::KeyStr(/*ts=*/103, user_key, /*seq=*/4, kTypeDeletionWithTimestamp), + test::KeyStr(/*ts=*/102, user_key, /*seq=*/3, kTypeValue), + test::KeyStr(/*ts=*/101, user_key, /*seq=*/2, kTypeValue), + test::KeyStr(/*ts=*/100, user_key, /*seq=*/1, kTypeValue)}; + const std::vector<std::string> input_values = {"", "a2", "a1", "a0"}; + { + std::string full_history_ts_low; + // Keys whose timestamps larger than or equal to 102 will be preserved. + PutFixed64(&full_history_ts_low, 102); + const std::vector<std::string> expected_keys = { + input_keys[0], input_keys[1], input_keys[2]}; + const std::vector<std::string> expected_values = {"", input_values[1], + input_values[2]}; + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_seq=*/kMaxSequenceNumber, + /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr, + /*bottommost_level=*/false, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + /*key_not_exists_beyond_output_level=*/false, &full_history_ts_low); + } +} + +TEST_P(CompactionIteratorTsGcTest, DropTombstones) { + constexpr char user_key[] = "a"; + const std::vector<std::string> input_keys = { + test::KeyStr(/*ts=*/103, user_key, /*seq=*/4, kTypeDeletionWithTimestamp), + test::KeyStr(/*ts=*/102, user_key, /*seq=*/3, kTypeValue), + test::KeyStr(/*ts=*/101, user_key, /*seq=*/2, kTypeDeletionWithTimestamp), + test::KeyStr(/*ts=*/100, user_key, /*seq=*/1, kTypeValue)}; + const std::vector<std::string> input_values = {"", "a2", "", "a0"}; + const std::vector<std::string> expected_keys = {input_keys[0], input_keys[1]}; + const std::vector<std::string> expected_values = {"", "a2"}; + + // Take a snapshot at seq 2. + AddSnapshot(2); + + { + // Non-bottommost level, but key does not exist beyond output level. 
+ std::string full_history_ts_low; + PutFixed64(&full_history_ts_low, 102); + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_sequence=*/kMaxSequenceNumber, + /*merge_op=*/nullptr, /*compaction_filter=*/nullptr, + /*bottommost_level=*/false, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + /*key_not_exists_beyond_output_level=*/true, &full_history_ts_low); + } + { + // Bottommost level + std::string full_history_ts_low; + PutFixed64(&full_history_ts_low, 102); + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_seq=*/kMaxSequenceNumber, + /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr, + /*bottommost_level=*/true, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + /*key_not_exists_beyond_output_level=*/false, &full_history_ts_low); + } +} + +TEST_P(CompactionIteratorTsGcTest, RewriteTs) { + constexpr char user_key[] = "a"; + const std::vector<std::string> input_keys = { + test::KeyStr(/*ts=*/103, user_key, /*seq=*/4, kTypeDeletionWithTimestamp), + test::KeyStr(/*ts=*/102, user_key, /*seq=*/3, kTypeValue), + test::KeyStr(/*ts=*/101, user_key, /*seq=*/2, kTypeDeletionWithTimestamp), + test::KeyStr(/*ts=*/100, user_key, /*seq=*/1, kTypeValue)}; + const std::vector<std::string> input_values = {"", "a2", "", "a0"}; + const std::vector<std::string> expected_keys = { + input_keys[0], input_keys[1], input_keys[2], + test::KeyStr(/*ts=*/0, user_key, /*seq=*/0, kTypeValue)}; + const std::vector<std::string> expected_values = {"", "a2", "", "a0"}; + + AddSnapshot(1); + AddSnapshot(2); + + { + // Bottommost level and need to rewrite both ts and seq. + std::string full_history_ts_low; + PutFixed64(&full_history_ts_low, 102); + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_seq=*/kMaxSequenceNumber, + /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr, + /*bottommost_level=*/true, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + /*key_not_exists_beyond_output_level=*/true, &full_history_ts_low); + } +} + +TEST_P(CompactionIteratorTsGcTest, SingleDeleteNoKeyEligibleForGC) { + constexpr char user_key[][2] = {{'a', '\0'}, {'b', '\0'}}; + const std::vector<std::string> input_keys = { + test::KeyStr(/*ts=*/104, user_key[0], /*seq=*/4, kTypeSingleDeletion), + test::KeyStr(/*ts=*/103, user_key[0], /*seq=*/3, kTypeValue), + test::KeyStr(/*ts=*/102, user_key[1], /*seq=*/2, kTypeValue)}; + const std::vector<std::string> input_values = {"", "a3", "b2"}; + std::string full_history_ts_low; + // All keys' timestamps are newer than or equal to 102, thus none of them + // will be eligible for GC. 
+ PutFixed64(&full_history_ts_low, 102); + const std::vector<std::string>& expected_keys = input_keys; + const std::vector<std::string>& expected_values = input_values; + const std::vector<std::pair<bool, bool>> params = { + {false, false}, {false, true}, {true, true}}; + for (const std::pair<bool, bool>& param : params) { + const bool bottommost_level = param.first; + const bool key_not_exists_beyond_output_level = param.second; + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_seq=*/kMaxSequenceNumber, + /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr, + bottommost_level, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + key_not_exists_beyond_output_level, &full_history_ts_low); + } +} + +TEST_P(CompactionIteratorTsGcTest, SingleDeleteDropTombstones) { + constexpr char user_key[] = "a"; + const std::vector<std::string> input_keys = { + test::KeyStr(/*ts=*/103, user_key, /*seq=*/4, kTypeSingleDeletion), + test::KeyStr(/*ts=*/102, user_key, /*seq=*/3, kTypeValue), + test::KeyStr(/*ts=*/101, user_key, /*seq=*/2, kTypeSingleDeletion), + test::KeyStr(/*ts=*/100, user_key, /*seq=*/1, kTypeValue)}; + const std::vector<std::string> input_values = {"", "a2", "", "a0"}; + const std::vector<std::string> expected_keys = {input_keys[0], input_keys[1]}; + const std::vector<std::string> expected_values = {"", "a2"}; + + // Take a snapshot at seq 2. + AddSnapshot(2); + { + const std::vector<std::pair<bool, bool>> params = { + {false, false}, {false, true}, {true, true}}; + for (const std::pair<bool, bool>& param : params) { + const bool bottommost_level = param.first; + const bool key_not_exists_beyond_output_level = param.second; + std::string full_history_ts_low; + PutFixed64(&full_history_ts_low, 102); + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_seq=*/kMaxSequenceNumber, + /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr, + bottommost_level, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + key_not_exists_beyond_output_level, &full_history_ts_low); + } + } +} + +TEST_P(CompactionIteratorTsGcTest, SingleDeleteAllKeysOlderThanThreshold) { + constexpr char user_key[][2] = {{'a', '\0'}, {'b', '\0'}}; + const std::vector<std::string> input_keys = { + test::KeyStr(/*ts=*/103, user_key[0], /*seq=*/4, kTypeSingleDeletion), + test::KeyStr(/*ts=*/102, user_key[0], /*seq=*/3, kTypeValue), + test::KeyStr(/*ts=*/104, user_key[1], /*seq=*/5, kTypeValue)}; + const std::vector<std::string> input_values = {"", "a2", "b5"}; + std::string full_history_ts_low; + PutFixed64(&full_history_ts_low, std::numeric_limits<uint64_t>::max()); + { + // With a snapshot at seq 3, both the deletion marker and the key at 3 must + // be preserved. + AddSnapshot(3); + const std::vector<std::string> expected_keys = { + input_keys[0], input_keys[1], input_keys[2]}; + const std::vector<std::string> expected_values = {"", "a2", "b5"}; + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_seq=*/kMaxSequenceNumber, + /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr, + /*bottommost_level=*/false, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + /*key_not_exists_beyond_output_level=*/false, &full_history_ts_low); + ClearSnapshots(); + } + { + // No snapshot. 
+ const std::vector<std::string> expected_keys = {input_keys[2]}; + const std::vector<std::string> expected_values = {"b5"}; + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_seq=*/kMaxSequenceNumber, + /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr, + /*bottommost_level=*/false, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + /*key_not_exists_beyond_output_level=*/false, &full_history_ts_low); + } +} + +INSTANTIATE_TEST_CASE_P(CompactionIteratorTsGcTestInstance, + CompactionIteratorTsGcTest, + testing::Values(true, false)); + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/compaction/compaction_job.cc b/src/rocksdb/db/compaction/compaction_job.cc new file mode 100644 index 000000000..1da1bcda8 --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_job.cc @@ -0,0 +1,2060 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/compaction/compaction_job.h" + +#include <algorithm> +#include <cinttypes> +#include <memory> +#include <optional> +#include <set> +#include <utility> +#include <vector> + +#include "db/blob/blob_counting_iterator.h" +#include "db/blob/blob_file_addition.h" +#include "db/blob/blob_file_builder.h" +#include "db/builder.h" +#include "db/compaction/clipping_iterator.h" +#include "db/compaction/compaction_state.h" +#include "db/db_impl/db_impl.h" +#include "db/dbformat.h" +#include "db/error_handler.h" +#include "db/event_helpers.h" +#include "db/history_trimming_iterator.h" +#include "db/log_writer.h" +#include "db/merge_helper.h" +#include "db/range_del_aggregator.h" +#include "db/version_edit.h" +#include "db/version_set.h" +#include "file/filename.h" +#include "file/read_write_util.h" +#include "file/sst_file_manager_impl.h" +#include "file/writable_file_writer.h" +#include "logging/log_buffer.h" +#include "logging/logging.h" +#include "monitoring/iostats_context_imp.h" +#include "monitoring/thread_status_util.h" +#include "options/configurable_helper.h" +#include "options/options_helper.h" +#include "port/port.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/statistics.h" +#include "rocksdb/status.h" +#include "rocksdb/table.h" +#include "rocksdb/utilities/options_type.h" +#include "table/merging_iterator.h" +#include "table/table_builder.h" +#include "table/unique_id_impl.h" +#include "test_util/sync_point.h" +#include "util/stop_watch.h" + +namespace ROCKSDB_NAMESPACE { + +const char* GetCompactionReasonString(CompactionReason compaction_reason) { + switch (compaction_reason) { + case CompactionReason::kUnknown: + return "Unknown"; + case CompactionReason::kLevelL0FilesNum: + return "LevelL0FilesNum"; + case CompactionReason::kLevelMaxLevelSize: + return "LevelMaxLevelSize"; + case CompactionReason::kUniversalSizeAmplification: + return "UniversalSizeAmplification"; + case CompactionReason::kUniversalSizeRatio: + return 
"UniversalSizeRatio"; + case CompactionReason::kUniversalSortedRunNum: + return "UniversalSortedRunNum"; + case CompactionReason::kFIFOMaxSize: + return "FIFOMaxSize"; + case CompactionReason::kFIFOReduceNumFiles: + return "FIFOReduceNumFiles"; + case CompactionReason::kFIFOTtl: + return "FIFOTtl"; + case CompactionReason::kManualCompaction: + return "ManualCompaction"; + case CompactionReason::kFilesMarkedForCompaction: + return "FilesMarkedForCompaction"; + case CompactionReason::kBottommostFiles: + return "BottommostFiles"; + case CompactionReason::kTtl: + return "Ttl"; + case CompactionReason::kFlush: + return "Flush"; + case CompactionReason::kExternalSstIngestion: + return "ExternalSstIngestion"; + case CompactionReason::kPeriodicCompaction: + return "PeriodicCompaction"; + case CompactionReason::kChangeTemperature: + return "ChangeTemperature"; + case CompactionReason::kForcedBlobGC: + return "ForcedBlobGC"; + case CompactionReason::kRoundRobinTtl: + return "RoundRobinTtl"; + case CompactionReason::kNumOfReasons: + // fall through + default: + assert(false); + return "Invalid"; + } +} + +const char* GetCompactionPenultimateOutputRangeTypeString( + Compaction::PenultimateOutputRangeType range_type) { + switch (range_type) { + case Compaction::PenultimateOutputRangeType::kNotSupported: + return "NotSupported"; + case Compaction::PenultimateOutputRangeType::kFullRange: + return "FullRange"; + case Compaction::PenultimateOutputRangeType::kNonLastRange: + return "NonLastRange"; + case Compaction::PenultimateOutputRangeType::kDisabled: + return "Disabled"; + default: + assert(false); + return "Invalid"; + } +} + +CompactionJob::CompactionJob( + int job_id, Compaction* compaction, const ImmutableDBOptions& db_options, + const MutableDBOptions& mutable_db_options, const FileOptions& file_options, + VersionSet* versions, const std::atomic<bool>* shutting_down, + LogBuffer* log_buffer, FSDirectory* db_directory, + FSDirectory* output_directory, FSDirectory* blob_output_directory, + Statistics* stats, InstrumentedMutex* db_mutex, + ErrorHandler* db_error_handler, + std::vector<SequenceNumber> existing_snapshots, + SequenceNumber earliest_write_conflict_snapshot, + const SnapshotChecker* snapshot_checker, JobContext* job_context, + std::shared_ptr<Cache> table_cache, EventLogger* event_logger, + bool paranoid_file_checks, bool measure_io_stats, const std::string& dbname, + CompactionJobStats* compaction_job_stats, Env::Priority thread_pri, + const std::shared_ptr<IOTracer>& io_tracer, + const std::atomic<bool>& manual_compaction_canceled, + const std::string& db_id, const std::string& db_session_id, + std::string full_history_ts_low, std::string trim_ts, + BlobFileCompletionCallback* blob_callback, int* bg_compaction_scheduled, + int* bg_bottom_compaction_scheduled) + : compact_(new CompactionState(compaction)), + compaction_stats_(compaction->compaction_reason(), 1), + db_options_(db_options), + mutable_db_options_copy_(mutable_db_options), + log_buffer_(log_buffer), + output_directory_(output_directory), + stats_(stats), + bottommost_level_(false), + write_hint_(Env::WLTH_NOT_SET), + compaction_job_stats_(compaction_job_stats), + job_id_(job_id), + dbname_(dbname), + db_id_(db_id), + db_session_id_(db_session_id), + file_options_(file_options), + env_(db_options.env), + io_tracer_(io_tracer), + fs_(db_options.fs, io_tracer), + file_options_for_read_( + fs_->OptimizeForCompactionTableRead(file_options, db_options_)), + versions_(versions), + shutting_down_(shutting_down), + 
manual_compaction_canceled_(manual_compaction_canceled), + db_directory_(db_directory), + blob_output_directory_(blob_output_directory), + db_mutex_(db_mutex), + db_error_handler_(db_error_handler), + existing_snapshots_(std::move(existing_snapshots)), + earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot), + snapshot_checker_(snapshot_checker), + job_context_(job_context), + table_cache_(std::move(table_cache)), + event_logger_(event_logger), + paranoid_file_checks_(paranoid_file_checks), + measure_io_stats_(measure_io_stats), + thread_pri_(thread_pri), + full_history_ts_low_(std::move(full_history_ts_low)), + trim_ts_(std::move(trim_ts)), + blob_callback_(blob_callback), + extra_num_subcompaction_threads_reserved_(0), + bg_compaction_scheduled_(bg_compaction_scheduled), + bg_bottom_compaction_scheduled_(bg_bottom_compaction_scheduled) { + assert(compaction_job_stats_ != nullptr); + assert(log_buffer_ != nullptr); + + const auto* cfd = compact_->compaction->column_family_data(); + ThreadStatusUtil::SetColumnFamily(cfd, cfd->ioptions()->env, + db_options_.enable_thread_tracking); + ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION); + ReportStartedCompaction(compaction); +} + +CompactionJob::~CompactionJob() { + assert(compact_ == nullptr); + ThreadStatusUtil::ResetThreadStatus(); +} + +void CompactionJob::ReportStartedCompaction(Compaction* compaction) { + const auto* cfd = compact_->compaction->column_family_data(); + ThreadStatusUtil::SetColumnFamily(cfd, cfd->ioptions()->env, + db_options_.enable_thread_tracking); + + ThreadStatusUtil::SetThreadOperationProperty(ThreadStatus::COMPACTION_JOB_ID, + job_id_); + + ThreadStatusUtil::SetThreadOperationProperty( + ThreadStatus::COMPACTION_INPUT_OUTPUT_LEVEL, + (static_cast<uint64_t>(compact_->compaction->start_level()) << 32) + + compact_->compaction->output_level()); + + // In the current design, a CompactionJob is always created + // for non-trivial compaction. + assert(compaction->IsTrivialMove() == false || + compaction->is_manual_compaction() == true); + + ThreadStatusUtil::SetThreadOperationProperty( + ThreadStatus::COMPACTION_PROP_FLAGS, + compaction->is_manual_compaction() + + (compaction->deletion_compaction() << 1)); + + ThreadStatusUtil::SetThreadOperationProperty( + ThreadStatus::COMPACTION_TOTAL_INPUT_BYTES, + compaction->CalculateTotalInputSize()); + + IOSTATS_RESET(bytes_written); + IOSTATS_RESET(bytes_read); + ThreadStatusUtil::SetThreadOperationProperty( + ThreadStatus::COMPACTION_BYTES_WRITTEN, 0); + ThreadStatusUtil::SetThreadOperationProperty( + ThreadStatus::COMPACTION_BYTES_READ, 0); + + // Set the thread operation after operation properties + // to ensure GetThreadList() can always show them all together. 
+ ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION); + + compaction_job_stats_->is_manual_compaction = + compaction->is_manual_compaction(); + compaction_job_stats_->is_full_compaction = compaction->is_full_compaction(); +} + +void CompactionJob::Prepare() { + AutoThreadOperationStageUpdater stage_updater( + ThreadStatus::STAGE_COMPACTION_PREPARE); + + // Generate file_levels_ for compaction before making Iterator + auto* c = compact_->compaction; + ColumnFamilyData* cfd = c->column_family_data(); + assert(cfd != nullptr); + assert(cfd->current()->storage_info()->NumLevelFiles( + compact_->compaction->level()) > 0); + + write_hint_ = cfd->CalculateSSTWriteHint(c->output_level()); + bottommost_level_ = c->bottommost_level(); + + if (c->ShouldFormSubcompactions()) { + StopWatch sw(db_options_.clock, stats_, SUBCOMPACTION_SETUP_TIME); + GenSubcompactionBoundaries(); + } + if (boundaries_.size() > 1) { + for (size_t i = 0; i <= boundaries_.size(); i++) { + compact_->sub_compact_states.emplace_back( + c, (i != 0) ? std::optional<Slice>(boundaries_[i - 1]) : std::nullopt, + (i != boundaries_.size()) ? std::optional<Slice>(boundaries_[i]) + : std::nullopt, + static_cast<uint32_t>(i)); + // assert to validate that boundaries don't have same user keys (without + // timestamp part). + assert(i == 0 || i == boundaries_.size() || + cfd->user_comparator()->CompareWithoutTimestamp( + boundaries_[i - 1], boundaries_[i]) < 0); + } + RecordInHistogram(stats_, NUM_SUBCOMPACTIONS_SCHEDULED, + compact_->sub_compact_states.size()); + } else { + compact_->sub_compact_states.emplace_back(c, std::nullopt, std::nullopt, + /*sub_job_id*/ 0); + } + + // collect all seqno->time information from the input files which will be used + // to encode seqno->time to the output files. + uint64_t preserve_time_duration = + std::max(c->immutable_options()->preserve_internal_time_seconds, + c->immutable_options()->preclude_last_level_data_seconds); + + if (preserve_time_duration > 0) { + // setup seqno_time_mapping_ + seqno_time_mapping_.SetMaxTimeDuration(preserve_time_duration); + for (const auto& each_level : *c->inputs()) { + for (const auto& fmd : each_level.files) { + std::shared_ptr<const TableProperties> tp; + Status s = cfd->current()->GetTableProperties(&tp, fmd, nullptr); + if (s.ok()) { + seqno_time_mapping_.Add(tp->seqno_to_time_mapping) + .PermitUncheckedError(); + seqno_time_mapping_.Add(fmd->fd.smallest_seqno, + fmd->oldest_ancester_time); + } + } + } + + auto status = seqno_time_mapping_.Sort(); + if (!status.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Invalid sequence number to time mapping: Status: %s", + status.ToString().c_str()); + } + int64_t _current_time = 0; + status = db_options_.clock->GetCurrentTime(&_current_time); + if (!status.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Failed to get current time in compaction: Status: %s", + status.ToString().c_str()); + // preserve all time information + preserve_time_min_seqno_ = 0; + preclude_last_level_min_seqno_ = 0; + } else { + seqno_time_mapping_.TruncateOldEntries(_current_time); + uint64_t preserve_time = + static_cast<uint64_t>(_current_time) > preserve_time_duration + ? _current_time - preserve_time_duration + : 0; + preserve_time_min_seqno_ = + seqno_time_mapping_.GetOldestSequenceNum(preserve_time); + if (c->immutable_options()->preclude_last_level_data_seconds > 0) { + uint64_t preclude_last_level_time = + static_cast<uint64_t>(_current_time) > + c->immutable_options()->preclude_last_level_data_seconds + ? 
_current_time - + c->immutable_options()->preclude_last_level_data_seconds + : 0; + preclude_last_level_min_seqno_ = + seqno_time_mapping_.GetOldestSequenceNum(preclude_last_level_time); + } + } + } +} + +uint64_t CompactionJob::GetSubcompactionsLimit() { + return extra_num_subcompaction_threads_reserved_ + + std::max( + std::uint64_t(1), + static_cast<uint64_t>(compact_->compaction->max_subcompactions())); +} + +void CompactionJob::AcquireSubcompactionResources( + int num_extra_required_subcompactions) { + TEST_SYNC_POINT("CompactionJob::AcquireSubcompactionResources:0"); + TEST_SYNC_POINT("CompactionJob::AcquireSubcompactionResources:1"); + int max_db_compactions = + DBImpl::GetBGJobLimits( + mutable_db_options_copy_.max_background_flushes, + mutable_db_options_copy_.max_background_compactions, + mutable_db_options_copy_.max_background_jobs, + versions_->GetColumnFamilySet() + ->write_controller() + ->NeedSpeedupCompaction()) + .max_compactions; + InstrumentedMutexLock l(db_mutex_); + // Apply min function first since We need to compute the extra subcompaction + // against compaction limits. And then try to reserve threads for extra + // subcompactions. The actual number of reserved threads could be less than + // the desired number. + int available_bg_compactions_against_db_limit = + std::max(max_db_compactions - *bg_compaction_scheduled_ - + *bg_bottom_compaction_scheduled_, + 0); + // Reservation only supports backgrdoun threads of which the priority is + // between BOTTOM and HIGH. Need to degrade the priority to HIGH if the + // origin thread_pri_ is higher than that. Similar to ReleaseThreads(). + extra_num_subcompaction_threads_reserved_ = + env_->ReserveThreads(std::min(num_extra_required_subcompactions, + available_bg_compactions_against_db_limit), + std::min(thread_pri_, Env::Priority::HIGH)); + + // Update bg_compaction_scheduled_ or bg_bottom_compaction_scheduled_ + // depending on if this compaction has the bottommost priority + if (thread_pri_ == Env::Priority::BOTTOM) { + *bg_bottom_compaction_scheduled_ += + extra_num_subcompaction_threads_reserved_; + } else { + *bg_compaction_scheduled_ += extra_num_subcompaction_threads_reserved_; + } +} + +void CompactionJob::ShrinkSubcompactionResources(uint64_t num_extra_resources) { + // Do nothing when we have zero resources to shrink + if (num_extra_resources == 0) return; + db_mutex_->Lock(); + // We cannot release threads more than what we reserved before + int extra_num_subcompaction_threads_released = env_->ReleaseThreads( + (int)num_extra_resources, std::min(thread_pri_, Env::Priority::HIGH)); + // Update the number of reserved threads and the number of background + // scheduled compactions for this compaction job + extra_num_subcompaction_threads_reserved_ -= + extra_num_subcompaction_threads_released; + // TODO (zichen): design a test case with new subcompaction partitioning + // when the number of actual partitions is less than the number of planned + // partitions + assert(extra_num_subcompaction_threads_released == (int)num_extra_resources); + // Update bg_compaction_scheduled_ or bg_bottom_compaction_scheduled_ + // depending on if this compaction has the bottommost priority + if (thread_pri_ == Env::Priority::BOTTOM) { + *bg_bottom_compaction_scheduled_ -= + extra_num_subcompaction_threads_released; + } else { + *bg_compaction_scheduled_ -= extra_num_subcompaction_threads_released; + } + db_mutex_->Unlock(); + TEST_SYNC_POINT("CompactionJob::ShrinkSubcompactionResources:0"); +} + +void 
CompactionJob::ReleaseSubcompactionResources() {
+  if (extra_num_subcompaction_threads_reserved_ == 0) {
+    return;
+  }
+  {
+    InstrumentedMutexLock l(db_mutex_);
+    // The number of reserved threads becomes larger than 0 only if the
+    // compaction priority is round-robin and there are not sufficient
+    // subcompactions available
+
+    // The number of scheduled compactions must be at least 1 plus the number
+    // of extra subcompactions using acquired resources, since this compaction
+    // job has not finished yet
+    assert(*bg_bottom_compaction_scheduled_ >=
+               1 + extra_num_subcompaction_threads_reserved_ ||
+           *bg_compaction_scheduled_ >=
+               1 + extra_num_subcompaction_threads_reserved_);
+  }
+  ShrinkSubcompactionResources(extra_num_subcompaction_threads_reserved_);
+}
+
+struct RangeWithSize {
+  Range range;
+  uint64_t size;
+
+  RangeWithSize(const Slice& a, const Slice& b, uint64_t s = 0)
+      : range(a, b), size(s) {}
+};
+
+void CompactionJob::GenSubcompactionBoundaries() {
+  // The goal is to find some boundary keys so that we can evenly partition
+  // the compaction input data into max_subcompactions ranges.
+  // For every input file, we ask TableReader to estimate 128 anchor points
+  // that evenly partition the input file into 128 ranges and the range
+  // sizes. This can be calculated by scanning index blocks of the file.
+  // Once we have the anchor points for all the input files, we merge them
+  // together and try to find keys dividing the ranges evenly.
+  // For example, if we have two input files, and each returns the following
+  // ranges:
+  //   File1: (a1, 1000), (b1, 1200), (c1, 1100)
+  //   File2: (a2, 1100), (b2, 1000), (c2, 1000)
+  // We total-sort the keys to the following:
+  //   (a1, 1000), (a2, 1100), (b1, 1200), (b2, 1000), (c1, 1100), (c2, 1000)
+  // We calculate the total size by adding up all ranges' sizes, which is 6400.
+  // If we would like to partition into 2 subcompactions, the target range
+  // size is 3200. Based on the sizes, we take "b1" as the partition key,
+  // since the cumulative size of the first three ranges reaches the 3200
+  // target there.
+  //
+  // Note that the ranges are actually overlapping. For example, in the
+  // example above, the range ending with "b1" overlaps with the range ending
+  // with "b2". So the size 1000+1100+1200 is an underestimation of the data
+  // size up to "b1". In extreme cases where we only compact N L0 files, a
+  // range can overlap with N-1 other ranges. Since we requested a relatively
+  // large number (128) of ranges from each input file, even N-way range
+  // overlap would cause relatively small inaccuracy.
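To make the threshold walk described in the comment above concrete, here is a self-contained sketch that reproduces the File1/File2 numbers from that comment. It is illustrative only: Anchor is a simplified stand-in for TableReader::Anchor, plain std::string comparison replaces the user comparator, and the MaxFileSizeForLevel floor on the target range size is omitted.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

// Simplified stand-in for TableReader::Anchor: a user key plus the estimated
// size of the range ending at that key.
struct Anchor {
  std::string user_key;
  uint64_t range_size;
};

// Sort all anchors, accumulate their sizes, and emit a boundary key every
// time the cumulative size crosses the next multiple of the target size.
std::vector<std::string> PickBoundaries(std::vector<Anchor> anchors,
                                        uint64_t num_subcompactions) {
  std::sort(anchors.begin(), anchors.end(),
            [](const Anchor& a, const Anchor& b) {
              return a.user_key < b.user_key;
            });
  uint64_t total_size = 0;
  for (const Anchor& a : anchors) total_size += a.range_size;

  const uint64_t target_range_size = total_size / num_subcompactions;
  uint64_t next_threshold = target_range_size;
  uint64_t cumulative_size = 0;
  uint64_t num_actual_subcompactions = 1;
  std::vector<std::string> boundaries;
  for (const Anchor& a : anchors) {
    cumulative_size += a.range_size;
    if (cumulative_size > next_threshold) {
      next_threshold += target_range_size;
      ++num_actual_subcompactions;
      boundaries.push_back(a.user_key);
    }
    if (num_actual_subcompactions == num_subcompactions) break;
  }
  return boundaries;
}

int main() {
  // Anchors from File1 and File2 in the comment; total size is 6400, so with
  // two subcompactions the target range size is 3200 and the walk picks "b1".
  std::vector<Anchor> anchors = {{"a1", 1000}, {"b1", 1200}, {"c1", 1100},
                                 {"a2", 1100}, {"b2", 1000}, {"c2", 1000}};
  for (const std::string& key : PickBoundaries(anchors, 2)) {
    std::cout << "boundary: " << key << "\n";  // prints "boundary: b1"
  }
  return 0;
}

The actual function below additionally floors target_range_size at MaxFileSizeForLevel, so a subcompaction range is never planned smaller than the output level's target file size.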
+
+  auto* c = compact_->compaction;
+  if (c->max_subcompactions() <= 1 &&
+      !(c->immutable_options()->compaction_pri == kRoundRobin &&
+        c->immutable_options()->compaction_style == kCompactionStyleLevel)) {
+    return;
+  }
+  auto* cfd = c->column_family_data();
+  const Comparator* cfd_comparator = cfd->user_comparator();
+  const InternalKeyComparator& icomp = cfd->internal_comparator();
+
+  auto* v = compact_->compaction->input_version();
+  int base_level = v->storage_info()->base_level();
+  InstrumentedMutexUnlock unlock_guard(db_mutex_);
+
+  uint64_t total_size = 0;
+  std::vector<TableReader::Anchor> all_anchors;
+  int start_lvl = c->start_level();
+  int out_lvl = c->output_level();
+
+  for (size_t lvl_idx = 0; lvl_idx < c->num_input_levels(); lvl_idx++) {
+    int lvl = c->level(lvl_idx);
+    if (lvl >= start_lvl && lvl <= out_lvl) {
+      const LevelFilesBrief* flevel = c->input_levels(lvl_idx);
+      size_t num_files = flevel->num_files;
+
+      if (num_files == 0) {
+        continue;
+      }
+
+      for (size_t i = 0; i < num_files; i++) {
+        FileMetaData* f = flevel->files[i].file_metadata;
+        std::vector<TableReader::Anchor> my_anchors;
+        Status s = cfd->table_cache()->ApproximateKeyAnchors(
+            ReadOptions(), icomp, *f, my_anchors);
+        if (!s.ok() || my_anchors.empty()) {
+          my_anchors.emplace_back(f->largest.user_key(), f->fd.GetFileSize());
+        }
+        for (auto& ac : my_anchors) {
+          // Can be optimized to avoid this loop.
+          total_size += ac.range_size;
+        }
+
+        all_anchors.insert(all_anchors.end(), my_anchors.begin(),
+                           my_anchors.end());
+      }
+    }
+  }
+  // Here we total-sort all the anchor points across all files and go through
+  // them in the sorted order to find partitioning boundaries.
+  // This is not the most efficient implementation; a more efficient algorithm
+  // probably exists, but it would also be more complex. If performance turns
+  // out to be a problem, we can optimize.
+  std::sort(
+      all_anchors.begin(), all_anchors.end(),
+      [cfd_comparator](TableReader::Anchor& a, TableReader::Anchor& b) -> bool {
+        return cfd_comparator->CompareWithoutTimestamp(a.user_key, b.user_key) <
+               0;
+      });
+
+  // Remove duplicated entries from boundaries.
+  all_anchors.erase(
+      std::unique(all_anchors.begin(), all_anchors.end(),
+                  [cfd_comparator](TableReader::Anchor& a,
+                                   TableReader::Anchor& b) -> bool {
+                    return cfd_comparator->CompareWithoutTimestamp(
+                               a.user_key, b.user_key) == 0;
+                  }),
+      all_anchors.end());
+
+  // Get the number of planned subcompactions; this may reserve extra threads
+  // and update extra_num_subcompaction_threads_reserved_ for round-robin.
+  uint64_t num_planned_subcompactions;
+  if (c->immutable_options()->compaction_pri == kRoundRobin &&
+      c->immutable_options()->compaction_style == kCompactionStyleLevel) {
+    // For round-robin compaction priority, we need to employ more
+    // subcompactions (which may exceed the max_subcompactions limit). The
+    // extra subcompactions will be executed using reserved threads and are
+    // counted towards bg_compaction_scheduled or
+    // bg_bottom_compaction_scheduled.
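The sizing arithmetic that follows can be illustrated with a toy model. This is not the actual CompactionJob code: PlannedSubcompactions and its parameters are invented for the example, and the number of extra threads actually granted by the reservation step depends on the thread pool and the background-compaction limits, so it is passed in directly here.

#include <algorithm>
#include <cassert>
#include <cstdint>

// Toy model of planned-subcompaction sizing under round-robin compaction
// priority: start from the number of input files, and if that exceeds the
// base limit, re-cap it against the limit raised by however many extra
// threads were actually granted.
uint64_t PlannedSubcompactions(uint64_t num_input_files,
                               uint64_t max_subcompactions,
                               uint64_t extra_threads_granted) {
  const uint64_t base_limit = std::max<uint64_t>(1, max_subcompactions);
  uint64_t planned = num_input_files;
  if (planned > base_limit) {
    planned = std::min(planned, base_limit + extra_threads_granted);
  } else {
    planned = base_limit;
  }
  return planned;
}

int main() {
  // 10 input files, max_subcompactions = 4, only 3 extra threads granted:
  // the job plans min(10, 4 + 3) = 7 subcompactions.
  assert(PlannedSubcompactions(10, 4, 3) == 7);
  // If no extra threads are granted, the plan falls back to the base limit.
  assert(PlannedSubcompactions(10, 4, 0) == 4);
  // Fewer input files than the limit: the plan is simply the limit.
  assert(PlannedSubcompactions(2, 4, 0) == 4);
  return 0;
}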
+ + // Initialized by the number of input files + num_planned_subcompactions = static_cast<uint64_t>(c->num_input_files(0)); + uint64_t max_subcompactions_limit = GetSubcompactionsLimit(); + if (max_subcompactions_limit < num_planned_subcompactions) { + // Assert two pointers are not empty so that we can use extra + // subcompactions against db compaction limits + assert(bg_bottom_compaction_scheduled_ != nullptr); + assert(bg_compaction_scheduled_ != nullptr); + // Reserve resources when max_subcompaction is not sufficient + AcquireSubcompactionResources( + (int)(num_planned_subcompactions - max_subcompactions_limit)); + // Subcompactions limit changes after acquiring additional resources. + // Need to call GetSubcompactionsLimit() again to update the number + // of planned subcompactions + num_planned_subcompactions = + std::min(num_planned_subcompactions, GetSubcompactionsLimit()); + } else { + num_planned_subcompactions = max_subcompactions_limit; + } + } else { + num_planned_subcompactions = GetSubcompactionsLimit(); + } + + TEST_SYNC_POINT_CALLBACK("CompactionJob::GenSubcompactionBoundaries:0", + &num_planned_subcompactions); + if (num_planned_subcompactions == 1) return; + + // Group the ranges into subcompactions + uint64_t target_range_size = std::max( + total_size / num_planned_subcompactions, + MaxFileSizeForLevel( + *(c->mutable_cf_options()), out_lvl, + c->immutable_options()->compaction_style, base_level, + c->immutable_options()->level_compaction_dynamic_level_bytes)); + + if (target_range_size >= total_size) { + return; + } + + uint64_t next_threshold = target_range_size; + uint64_t cumulative_size = 0; + uint64_t num_actual_subcompactions = 1U; + for (TableReader::Anchor& anchor : all_anchors) { + cumulative_size += anchor.range_size; + if (cumulative_size > next_threshold) { + next_threshold += target_range_size; + num_actual_subcompactions++; + boundaries_.push_back(anchor.user_key); + } + if (num_actual_subcompactions == num_planned_subcompactions) { + break; + } + } + TEST_SYNC_POINT_CALLBACK("CompactionJob::GenSubcompactionBoundaries:1", + &num_actual_subcompactions); + // Shrink extra subcompactions resources when extra resrouces are acquired + ShrinkSubcompactionResources( + std::min((int)(num_planned_subcompactions - num_actual_subcompactions), + extra_num_subcompaction_threads_reserved_)); +} + +Status CompactionJob::Run() { + AutoThreadOperationStageUpdater stage_updater( + ThreadStatus::STAGE_COMPACTION_RUN); + TEST_SYNC_POINT("CompactionJob::Run():Start"); + log_buffer_->FlushBufferToLog(); + LogCompaction(); + + const size_t num_threads = compact_->sub_compact_states.size(); + assert(num_threads > 0); + const uint64_t start_micros = db_options_.clock->NowMicros(); + + // Launch a thread for each of subcompactions 1...num_threads-1 + std::vector<port::Thread> thread_pool; + thread_pool.reserve(num_threads - 1); + for (size_t i = 1; i < compact_->sub_compact_states.size(); i++) { + thread_pool.emplace_back(&CompactionJob::ProcessKeyValueCompaction, this, + &compact_->sub_compact_states[i]); + } + + // Always schedule the first subcompaction (whether or not there are also + // others) in the current thread to be efficient with resources + ProcessKeyValueCompaction(&compact_->sub_compact_states[0]); + + // Wait for all other threads (if there are any) to finish execution + for (auto& thread : thread_pool) { + thread.join(); + } + + compaction_stats_.SetMicros(db_options_.clock->NowMicros() - start_micros); + + for (auto& state : compact_->sub_compact_states) { + 
compaction_stats_.AddCpuMicros(state.compaction_job_stats.cpu_micros); + state.RemoveLastEmptyOutput(); + } + + RecordTimeToHistogram(stats_, COMPACTION_TIME, + compaction_stats_.stats.micros); + RecordTimeToHistogram(stats_, COMPACTION_CPU_TIME, + compaction_stats_.stats.cpu_micros); + + TEST_SYNC_POINT("CompactionJob::Run:BeforeVerify"); + + // Check if any thread encountered an error during execution + Status status; + IOStatus io_s; + bool wrote_new_blob_files = false; + + for (const auto& state : compact_->sub_compact_states) { + if (!state.status.ok()) { + status = state.status; + io_s = state.io_status; + break; + } + + if (state.Current().HasBlobFileAdditions()) { + wrote_new_blob_files = true; + } + } + + if (io_status_.ok()) { + io_status_ = io_s; + } + if (status.ok()) { + constexpr IODebugContext* dbg = nullptr; + + if (output_directory_) { + io_s = output_directory_->FsyncWithDirOptions( + IOOptions(), dbg, + DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced)); + } + + if (io_s.ok() && wrote_new_blob_files && blob_output_directory_ && + blob_output_directory_ != output_directory_) { + io_s = blob_output_directory_->FsyncWithDirOptions( + IOOptions(), dbg, + DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced)); + } + } + if (io_status_.ok()) { + io_status_ = io_s; + } + if (status.ok()) { + status = io_s; + } + if (status.ok()) { + thread_pool.clear(); + std::vector<const CompactionOutputs::Output*> files_output; + for (const auto& state : compact_->sub_compact_states) { + for (const auto& output : state.GetOutputs()) { + files_output.emplace_back(&output); + } + } + ColumnFamilyData* cfd = compact_->compaction->column_family_data(); + auto& prefix_extractor = + compact_->compaction->mutable_cf_options()->prefix_extractor; + std::atomic<size_t> next_file_idx(0); + auto verify_table = [&](Status& output_status) { + while (true) { + size_t file_idx = next_file_idx.fetch_add(1); + if (file_idx >= files_output.size()) { + break; + } + // Verify that the table is usable + // We set for_compaction to false and don't + // OptimizeForCompactionTableRead here because this is a special case + // after we finish the table building No matter whether + // use_direct_io_for_flush_and_compaction is true, we will regard this + // verification as user reads since the goal is to cache it here for + // further user reads + ReadOptions read_options; + InternalIterator* iter = cfd->table_cache()->NewIterator( + read_options, file_options_, cfd->internal_comparator(), + files_output[file_idx]->meta, /*range_del_agg=*/nullptr, + prefix_extractor, + /*table_reader_ptr=*/nullptr, + cfd->internal_stats()->GetFileReadHist( + compact_->compaction->output_level()), + TableReaderCaller::kCompactionRefill, /*arena=*/nullptr, + /*skip_filters=*/false, compact_->compaction->output_level(), + MaxFileSizeForL0MetaPin( + *compact_->compaction->mutable_cf_options()), + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr, + /*allow_unprepared_value=*/false); + auto s = iter->status(); + + if (s.ok() && paranoid_file_checks_) { + OutputValidator validator(cfd->internal_comparator(), + /*_enable_order_check=*/true, + /*_enable_hash=*/true); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + s = validator.Add(iter->key(), iter->value()); + if (!s.ok()) { + break; + } + } + if (s.ok()) { + s = iter->status(); + } + if (s.ok() && + !validator.CompareValidator(files_output[file_idx]->validator)) { + s = Status::Corruption("Paranoid checksums do not match"); + } + } + + 
delete iter; + + if (!s.ok()) { + output_status = s; + break; + } + } + }; + for (size_t i = 1; i < compact_->sub_compact_states.size(); i++) { + thread_pool.emplace_back( + verify_table, std::ref(compact_->sub_compact_states[i].status)); + } + verify_table(compact_->sub_compact_states[0].status); + for (auto& thread : thread_pool) { + thread.join(); + } + + for (const auto& state : compact_->sub_compact_states) { + if (!state.status.ok()) { + status = state.status; + break; + } + } + } + + ReleaseSubcompactionResources(); + TEST_SYNC_POINT("CompactionJob::ReleaseSubcompactionResources:0"); + TEST_SYNC_POINT("CompactionJob::ReleaseSubcompactionResources:1"); + + TablePropertiesCollection tp; + for (const auto& state : compact_->sub_compact_states) { + for (const auto& output : state.GetOutputs()) { + auto fn = + TableFileName(state.compaction->immutable_options()->cf_paths, + output.meta.fd.GetNumber(), output.meta.fd.GetPathId()); + tp[fn] = output.table_properties; + } + } + compact_->compaction->SetOutputTableProperties(std::move(tp)); + + // Finish up all book-keeping to unify the subcompaction results + compact_->AggregateCompactionStats(compaction_stats_, *compaction_job_stats_); + UpdateCompactionStats(); + + RecordCompactionIOStats(); + LogFlush(db_options_.info_log); + TEST_SYNC_POINT("CompactionJob::Run():End"); + + compact_->status = status; + return status; +} + +Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) { + assert(compact_); + + AutoThreadOperationStageUpdater stage_updater( + ThreadStatus::STAGE_COMPACTION_INSTALL); + db_mutex_->AssertHeld(); + Status status = compact_->status; + + ColumnFamilyData* cfd = compact_->compaction->column_family_data(); + assert(cfd); + + int output_level = compact_->compaction->output_level(); + cfd->internal_stats()->AddCompactionStats(output_level, thread_pri_, + compaction_stats_); + + if (status.ok()) { + status = InstallCompactionResults(mutable_cf_options); + } + if (!versions_->io_status().ok()) { + io_status_ = versions_->io_status(); + } + + VersionStorageInfo::LevelSummaryStorage tmp; + auto vstorage = cfd->current()->storage_info(); + const auto& stats = compaction_stats_.stats; + + double read_write_amp = 0.0; + double write_amp = 0.0; + double bytes_read_per_sec = 0; + double bytes_written_per_sec = 0; + + const uint64_t bytes_read_non_output_and_blob = + stats.bytes_read_non_output_levels + stats.bytes_read_blob; + const uint64_t bytes_read_all = + stats.bytes_read_output_level + bytes_read_non_output_and_blob; + const uint64_t bytes_written_all = + stats.bytes_written + stats.bytes_written_blob; + + if (bytes_read_non_output_and_blob > 0) { + read_write_amp = (bytes_written_all + bytes_read_all) / + static_cast<double>(bytes_read_non_output_and_blob); + write_amp = + bytes_written_all / static_cast<double>(bytes_read_non_output_and_blob); + } + if (stats.micros > 0) { + bytes_read_per_sec = bytes_read_all / static_cast<double>(stats.micros); + bytes_written_per_sec = + bytes_written_all / static_cast<double>(stats.micros); + } + + const std::string& column_family_name = cfd->GetName(); + + constexpr double kMB = 1048576.0; + + ROCKS_LOG_BUFFER( + log_buffer_, + "[%s] compacted to: %s, MB/sec: %.1f rd, %.1f wr, level %d, " + "files in(%d, %d) out(%d +%d blob) " + "MB in(%.1f, %.1f +%.1f blob) out(%.1f +%.1f blob), " + "read-write-amplify(%.1f) write-amplify(%.1f) %s, records in: %" PRIu64 + ", records dropped: %" PRIu64 " output_compression: %s\n", + column_family_name.c_str(), 
vstorage->LevelSummary(&tmp), + bytes_read_per_sec, bytes_written_per_sec, + compact_->compaction->output_level(), + stats.num_input_files_in_non_output_levels, + stats.num_input_files_in_output_level, stats.num_output_files, + stats.num_output_files_blob, stats.bytes_read_non_output_levels / kMB, + stats.bytes_read_output_level / kMB, stats.bytes_read_blob / kMB, + stats.bytes_written / kMB, stats.bytes_written_blob / kMB, read_write_amp, + write_amp, status.ToString().c_str(), stats.num_input_records, + stats.num_dropped_records, + CompressionTypeToString(compact_->compaction->output_compression()) + .c_str()); + + const auto& blob_files = vstorage->GetBlobFiles(); + if (!blob_files.empty()) { + assert(blob_files.front()); + assert(blob_files.back()); + + ROCKS_LOG_BUFFER( + log_buffer_, + "[%s] Blob file summary: head=%" PRIu64 ", tail=%" PRIu64 "\n", + column_family_name.c_str(), blob_files.front()->GetBlobFileNumber(), + blob_files.back()->GetBlobFileNumber()); + } + + if (compaction_stats_.has_penultimate_level_output) { + ROCKS_LOG_BUFFER( + log_buffer_, + "[%s] has Penultimate Level output: %" PRIu64 + ", level %d, number of files: %" PRIu64 ", number of records: %" PRIu64, + column_family_name.c_str(), + compaction_stats_.penultimate_level_stats.bytes_written, + compact_->compaction->GetPenultimateLevel(), + compaction_stats_.penultimate_level_stats.num_output_files, + compaction_stats_.penultimate_level_stats.num_output_records); + } + + UpdateCompactionJobStats(stats); + + auto stream = event_logger_->LogToBuffer(log_buffer_, 8192); + stream << "job" << job_id_ << "event" + << "compaction_finished" + << "compaction_time_micros" << stats.micros + << "compaction_time_cpu_micros" << stats.cpu_micros << "output_level" + << compact_->compaction->output_level() << "num_output_files" + << stats.num_output_files << "total_output_size" + << stats.bytes_written; + + if (stats.num_output_files_blob > 0) { + stream << "num_blob_output_files" << stats.num_output_files_blob + << "total_blob_output_size" << stats.bytes_written_blob; + } + + stream << "num_input_records" << stats.num_input_records + << "num_output_records" << stats.num_output_records + << "num_subcompactions" << compact_->sub_compact_states.size() + << "output_compression" + << CompressionTypeToString(compact_->compaction->output_compression()); + + stream << "num_single_delete_mismatches" + << compaction_job_stats_->num_single_del_mismatch; + stream << "num_single_delete_fallthrough" + << compaction_job_stats_->num_single_del_fallthru; + + if (measure_io_stats_) { + stream << "file_write_nanos" << compaction_job_stats_->file_write_nanos; + stream << "file_range_sync_nanos" + << compaction_job_stats_->file_range_sync_nanos; + stream << "file_fsync_nanos" << compaction_job_stats_->file_fsync_nanos; + stream << "file_prepare_write_nanos" + << compaction_job_stats_->file_prepare_write_nanos; + } + + stream << "lsm_state"; + stream.StartArray(); + for (int level = 0; level < vstorage->num_levels(); ++level) { + stream << vstorage->NumLevelFiles(level); + } + stream.EndArray(); + + if (!blob_files.empty()) { + assert(blob_files.front()); + stream << "blob_file_head" << blob_files.front()->GetBlobFileNumber(); + + assert(blob_files.back()); + stream << "blob_file_tail" << blob_files.back()->GetBlobFileNumber(); + } + + if (compaction_stats_.has_penultimate_level_output) { + InternalStats::CompactionStats& pl_stats = + compaction_stats_.penultimate_level_stats; + stream << "penultimate_level_num_output_files" << 
pl_stats.num_output_files; + stream << "penultimate_level_bytes_written" << pl_stats.bytes_written; + stream << "penultimate_level_num_output_records" + << pl_stats.num_output_records; + stream << "penultimate_level_num_output_files_blob" + << pl_stats.num_output_files_blob; + stream << "penultimate_level_bytes_written_blob" + << pl_stats.bytes_written_blob; + } + + CleanupCompaction(); + return status; +} + +void CompactionJob::NotifyOnSubcompactionBegin( + SubcompactionState* sub_compact) { +#ifndef ROCKSDB_LITE + Compaction* c = compact_->compaction; + + if (db_options_.listeners.empty()) { + return; + } + if (shutting_down_->load(std::memory_order_acquire)) { + return; + } + if (c->is_manual_compaction() && + manual_compaction_canceled_.load(std::memory_order_acquire)) { + return; + } + + sub_compact->notify_on_subcompaction_completion = true; + + SubcompactionJobInfo info{}; + sub_compact->BuildSubcompactionJobInfo(info); + info.job_id = static_cast<int>(job_id_); + info.thread_id = env_->GetThreadID(); + + for (const auto& listener : db_options_.listeners) { + listener->OnSubcompactionBegin(info); + } + info.status.PermitUncheckedError(); + +#else + (void)sub_compact; +#endif // ROCKSDB_LITE +} + +void CompactionJob::NotifyOnSubcompactionCompleted( + SubcompactionState* sub_compact) { +#ifndef ROCKSDB_LITE + + if (db_options_.listeners.empty()) { + return; + } + if (shutting_down_->load(std::memory_order_acquire)) { + return; + } + + if (sub_compact->notify_on_subcompaction_completion == false) { + return; + } + + SubcompactionJobInfo info{}; + sub_compact->BuildSubcompactionJobInfo(info); + info.job_id = static_cast<int>(job_id_); + info.thread_id = env_->GetThreadID(); + + for (const auto& listener : db_options_.listeners) { + listener->OnSubcompactionCompleted(info); + } +#else + (void)sub_compact; +#endif // ROCKSDB_LITE +} + +void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { + assert(sub_compact); + assert(sub_compact->compaction); + +#ifndef ROCKSDB_LITE + if (db_options_.compaction_service) { + CompactionServiceJobStatus comp_status = + ProcessKeyValueCompactionWithCompactionService(sub_compact); + if (comp_status == CompactionServiceJobStatus::kSuccess || + comp_status == CompactionServiceJobStatus::kFailure) { + return; + } + // fallback to local compaction + assert(comp_status == CompactionServiceJobStatus::kUseLocal); + } +#endif // !ROCKSDB_LITE + + uint64_t prev_cpu_micros = db_options_.clock->CPUMicros(); + + ColumnFamilyData* cfd = sub_compact->compaction->column_family_data(); + + // Create compaction filter and fail the compaction if + // IgnoreSnapshots() = false because it is not supported anymore + const CompactionFilter* compaction_filter = + cfd->ioptions()->compaction_filter; + std::unique_ptr<CompactionFilter> compaction_filter_from_factory = nullptr; + if (compaction_filter == nullptr) { + compaction_filter_from_factory = + sub_compact->compaction->CreateCompactionFilter(); + compaction_filter = compaction_filter_from_factory.get(); + } + if (compaction_filter != nullptr && !compaction_filter->IgnoreSnapshots()) { + sub_compact->status = Status::NotSupported( + "CompactionFilter::IgnoreSnapshots() = false is not supported " + "anymore."); + return; + } + + NotifyOnSubcompactionBegin(sub_compact); + + auto range_del_agg = std::make_unique<CompactionRangeDelAggregator>( + &cfd->internal_comparator(), existing_snapshots_, &full_history_ts_low_, + &trim_ts_); + + // TODO: since we already use C++17, should use + // 
std::optional<const Slice> instead. + const std::optional<Slice> start = sub_compact->start; + const std::optional<Slice> end = sub_compact->end; + + std::optional<Slice> start_without_ts; + std::optional<Slice> end_without_ts; + + ReadOptions read_options; + read_options.verify_checksums = true; + read_options.fill_cache = false; + read_options.rate_limiter_priority = GetRateLimiterPriority(); + // Compaction iterators shouldn't be confined to a single prefix. + // Compactions use Seek() for + // (a) concurrent compactions, + // (b) CompactionFilter::Decision::kRemoveAndSkipUntil. + read_options.total_order_seek = true; + + // Remove the timestamps from boundaries because boundaries created in + // GenSubcompactionBoundaries doesn't strip away the timestamp. + size_t ts_sz = cfd->user_comparator()->timestamp_size(); + if (start.has_value()) { + read_options.iterate_lower_bound = &start.value(); + if (ts_sz > 0) { + start_without_ts = StripTimestampFromUserKey(start.value(), ts_sz); + read_options.iterate_lower_bound = &start_without_ts.value(); + } + } + if (end.has_value()) { + read_options.iterate_upper_bound = &end.value(); + if (ts_sz > 0) { + end_without_ts = StripTimestampFromUserKey(end.value(), ts_sz); + read_options.iterate_upper_bound = &end_without_ts.value(); + } + } + + // Although the v2 aggregator is what the level iterator(s) know about, + // the AddTombstones calls will be propagated down to the v1 aggregator. + std::unique_ptr<InternalIterator> raw_input(versions_->MakeInputIterator( + read_options, sub_compact->compaction, range_del_agg.get(), + file_options_for_read_, start, end)); + InternalIterator* input = raw_input.get(); + + IterKey start_ikey; + IterKey end_ikey; + Slice start_slice; + Slice end_slice; + + static constexpr char kMaxTs[] = + "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"; + Slice ts_slice; + std::string max_ts; + if (ts_sz > 0) { + if (ts_sz <= strlen(kMaxTs)) { + ts_slice = Slice(kMaxTs, ts_sz); + } else { + max_ts = std::string(ts_sz, '\xff'); + ts_slice = Slice(max_ts); + } + } + + if (start.has_value()) { + start_ikey.SetInternalKey(start.value(), kMaxSequenceNumber, + kValueTypeForSeek); + if (ts_sz > 0) { + start_ikey.UpdateInternalKey(kMaxSequenceNumber, kValueTypeForSeek, + &ts_slice); + } + start_slice = start_ikey.GetInternalKey(); + } + if (end.has_value()) { + end_ikey.SetInternalKey(end.value(), kMaxSequenceNumber, kValueTypeForSeek); + if (ts_sz > 0) { + end_ikey.UpdateInternalKey(kMaxSequenceNumber, kValueTypeForSeek, + &ts_slice); + } + end_slice = end_ikey.GetInternalKey(); + } + + std::unique_ptr<InternalIterator> clip; + if (start.has_value() || end.has_value()) { + clip = std::make_unique<ClippingIterator>( + raw_input.get(), start.has_value() ? &start_slice : nullptr, + end.has_value() ? 
&end_slice : nullptr, &cfd->internal_comparator()); + input = clip.get(); + } + + std::unique_ptr<InternalIterator> blob_counter; + + if (sub_compact->compaction->DoesInputReferenceBlobFiles()) { + BlobGarbageMeter* meter = sub_compact->Current().CreateBlobGarbageMeter(); + blob_counter = std::make_unique<BlobCountingIterator>(input, meter); + input = blob_counter.get(); + } + + std::unique_ptr<InternalIterator> trim_history_iter; + if (ts_sz > 0 && !trim_ts_.empty()) { + trim_history_iter = std::make_unique<HistoryTrimmingIterator>( + input, cfd->user_comparator(), trim_ts_); + input = trim_history_iter.get(); + } + + input->SeekToFirst(); + + AutoThreadOperationStageUpdater stage_updater( + ThreadStatus::STAGE_COMPACTION_PROCESS_KV); + + // I/O measurement variables + PerfLevel prev_perf_level = PerfLevel::kEnableTime; + const uint64_t kRecordStatsEvery = 1000; + uint64_t prev_write_nanos = 0; + uint64_t prev_fsync_nanos = 0; + uint64_t prev_range_sync_nanos = 0; + uint64_t prev_prepare_write_nanos = 0; + uint64_t prev_cpu_write_nanos = 0; + uint64_t prev_cpu_read_nanos = 0; + if (measure_io_stats_) { + prev_perf_level = GetPerfLevel(); + SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex); + prev_write_nanos = IOSTATS(write_nanos); + prev_fsync_nanos = IOSTATS(fsync_nanos); + prev_range_sync_nanos = IOSTATS(range_sync_nanos); + prev_prepare_write_nanos = IOSTATS(prepare_write_nanos); + prev_cpu_write_nanos = IOSTATS(cpu_write_nanos); + prev_cpu_read_nanos = IOSTATS(cpu_read_nanos); + } + + MergeHelper merge( + env_, cfd->user_comparator(), cfd->ioptions()->merge_operator.get(), + compaction_filter, db_options_.info_log.get(), + false /* internal key corruption is expected */, + existing_snapshots_.empty() ? 0 : existing_snapshots_.back(), + snapshot_checker_, compact_->compaction->level(), db_options_.stats); + + const MutableCFOptions* mutable_cf_options = + sub_compact->compaction->mutable_cf_options(); + assert(mutable_cf_options); + + std::vector<std::string> blob_file_paths; + + // TODO: BlobDB to support output_to_penultimate_level compaction, which needs + // 2 builders, so may need to move to `CompactionOutputs` + std::unique_ptr<BlobFileBuilder> blob_file_builder( + (mutable_cf_options->enable_blob_files && + sub_compact->compaction->output_level() >= + mutable_cf_options->blob_file_starting_level) + ? new BlobFileBuilder( + versions_, fs_.get(), + sub_compact->compaction->immutable_options(), + mutable_cf_options, &file_options_, db_id_, db_session_id_, + job_id_, cfd->GetID(), cfd->GetName(), Env::IOPriority::IO_LOW, + write_hint_, io_tracer_, blob_callback_, + BlobFileCreationReason::kCompaction, &blob_file_paths, + sub_compact->Current().GetBlobFileAdditionsPtr()) + : nullptr); + + TEST_SYNC_POINT("CompactionJob::Run():Inprogress"); + TEST_SYNC_POINT_CALLBACK( + "CompactionJob::Run():PausingManualCompaction:1", + reinterpret_cast<void*>( + const_cast<std::atomic<bool>*>(&manual_compaction_canceled_))); + + const std::string* const full_history_ts_low = + full_history_ts_low_.empty() ? nullptr : &full_history_ts_low_; + const SequenceNumber job_snapshot_seq = + job_context_ ? 
job_context_->GetJobSnapshotSequence() + : kMaxSequenceNumber; + + auto c_iter = std::make_unique<CompactionIterator>( + input, cfd->user_comparator(), &merge, versions_->LastSequence(), + &existing_snapshots_, earliest_write_conflict_snapshot_, job_snapshot_seq, + snapshot_checker_, env_, ShouldReportDetailedTime(env_, stats_), + /*expect_valid_internal_key=*/true, range_del_agg.get(), + blob_file_builder.get(), db_options_.allow_data_in_errors, + db_options_.enforce_single_del_contracts, manual_compaction_canceled_, + sub_compact->compaction, compaction_filter, shutting_down_, + db_options_.info_log, full_history_ts_low, preserve_time_min_seqno_, + preclude_last_level_min_seqno_); + c_iter->SeekToFirst(); + + // Assign range delete aggregator to the target output level, which makes sure + // it only output to single level + sub_compact->AssignRangeDelAggregator(std::move(range_del_agg)); + + const auto& c_iter_stats = c_iter->iter_stats(); + + // define the open and close functions for the compaction files, which will be + // used open/close output files when needed. + const CompactionFileOpenFunc open_file_func = + [this, sub_compact](CompactionOutputs& outputs) { + return this->OpenCompactionOutputFile(sub_compact, outputs); + }; + const CompactionFileCloseFunc close_file_func = + [this, sub_compact](CompactionOutputs& outputs, const Status& status, + const Slice& next_table_min_key) { + return this->FinishCompactionOutputFile(status, sub_compact, outputs, + next_table_min_key); + }; + + Status status; + TEST_SYNC_POINT_CALLBACK( + "CompactionJob::ProcessKeyValueCompaction()::Processing", + reinterpret_cast<void*>( + const_cast<Compaction*>(sub_compact->compaction))); + while (status.ok() && !cfd->IsDropped() && c_iter->Valid()) { + // Invariant: c_iter.status() is guaranteed to be OK if c_iter->Valid() + // returns true. + + assert(!end.has_value() || cfd->user_comparator()->Compare( + c_iter->user_key(), end.value()) < 0); + + if (c_iter_stats.num_input_records % kRecordStatsEvery == + kRecordStatsEvery - 1) { + RecordDroppedKeys(c_iter_stats, &sub_compact->compaction_job_stats); + c_iter->ResetRecordCounts(); + RecordCompactionIOStats(); + } + + // Add current compaction_iterator key to target compaction output, if the + // output file needs to be close or open, it will call the `open_file_func` + // and `close_file_func`. + // TODO: it would be better to have the compaction file open/close moved + // into `CompactionOutputs` which has the output file information. 
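  // Illustrative sketch (plain C++, hypothetical names -- OpenFn, CloseFn and
  // WriteRecords are not RocksDB types) of the callback pattern described in
  // the comment above and exercised by the AddToOutput() call that follows:
  // the loop only appends records, while the policy for when an output is
  // opened or closed stays with the caller-supplied functions.
  #include <functional>
  #include <string>
  #include <vector>

  using OpenFn = std::function<bool(std::string& sink)>;
  using CloseFn = std::function<bool(std::string& sink)>;

  bool WriteRecords(const std::vector<std::string>& records, const OpenFn& open,
                    const CloseFn& close) {
    std::string sink;
    bool opened = false;
    for (const auto& rec : records) {
      if (!opened) {
        if (!open(sink)) return false;  // open the output lazily, on first use
        opened = true;
      }
      sink += rec;  // stand-in for appending a key/value to the table builder
    }
    return opened ? close(sink) : true;  // nothing written, nothing to close
  }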
+ status = sub_compact->AddToOutput(*c_iter, open_file_func, close_file_func); + if (!status.ok()) { + break; + } + + TEST_SYNC_POINT_CALLBACK( + "CompactionJob::Run():PausingManualCompaction:2", + reinterpret_cast<void*>( + const_cast<std::atomic<bool>*>(&manual_compaction_canceled_))); + c_iter->Next(); + if (c_iter->status().IsManualCompactionPaused()) { + break; + } + } + + sub_compact->compaction_job_stats.num_blobs_read = + c_iter_stats.num_blobs_read; + sub_compact->compaction_job_stats.total_blob_bytes_read = + c_iter_stats.total_blob_bytes_read; + sub_compact->compaction_job_stats.num_input_deletion_records = + c_iter_stats.num_input_deletion_records; + sub_compact->compaction_job_stats.num_corrupt_keys = + c_iter_stats.num_input_corrupt_records; + sub_compact->compaction_job_stats.num_single_del_fallthru = + c_iter_stats.num_single_del_fallthru; + sub_compact->compaction_job_stats.num_single_del_mismatch = + c_iter_stats.num_single_del_mismatch; + sub_compact->compaction_job_stats.total_input_raw_key_bytes += + c_iter_stats.total_input_raw_key_bytes; + sub_compact->compaction_job_stats.total_input_raw_value_bytes += + c_iter_stats.total_input_raw_value_bytes; + + RecordTick(stats_, FILTER_OPERATION_TOTAL_TIME, + c_iter_stats.total_filter_time); + + if (c_iter_stats.num_blobs_relocated > 0) { + RecordTick(stats_, BLOB_DB_GC_NUM_KEYS_RELOCATED, + c_iter_stats.num_blobs_relocated); + } + if (c_iter_stats.total_blob_bytes_relocated > 0) { + RecordTick(stats_, BLOB_DB_GC_BYTES_RELOCATED, + c_iter_stats.total_blob_bytes_relocated); + } + + RecordDroppedKeys(c_iter_stats, &sub_compact->compaction_job_stats); + RecordCompactionIOStats(); + + if (status.ok() && cfd->IsDropped()) { + status = + Status::ColumnFamilyDropped("Column family dropped during compaction"); + } + if ((status.ok() || status.IsColumnFamilyDropped()) && + shutting_down_->load(std::memory_order_relaxed)) { + status = Status::ShutdownInProgress("Database shutdown"); + } + if ((status.ok() || status.IsColumnFamilyDropped()) && + (manual_compaction_canceled_.load(std::memory_order_relaxed))) { + status = Status::Incomplete(Status::SubCode::kManualCompactionPaused); + } + if (status.ok()) { + status = input->status(); + } + if (status.ok()) { + status = c_iter->status(); + } + + // Call FinishCompactionOutputFile() even if status is not ok: it needs to + // close the output files. Open file function is also passed, in case there's + // only range-dels, no file was opened, to save the range-dels, it need to + // create a new output file. 
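  // Minimal sketch of the "first error wins" folding performed just above,
  // using a simplified stand-in type rather than rocksdb::Status: a later
  // check can only replace the status while it is still OK, so the earliest
  // failure along the loop -> input iterator -> compaction iterator chain is
  // the one that gets reported.
  #include <string>
  #include <utility>

  struct SimpleStatus {
    bool ok_ = true;
    std::string msg_;
    bool ok() const { return ok_; }
  };

  SimpleStatus FoldFirstError(SimpleStatus loop_status,
                              SimpleStatus input_status,
                              SimpleStatus iter_status) {
    SimpleStatus s = std::move(loop_status);
    if (s.ok()) s = std::move(input_status);  // input iterator error, if any
    if (s.ok()) s = std::move(iter_status);   // compaction iterator error
    return s;
  }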
+ status = sub_compact->CloseCompactionFiles(status, open_file_func, + close_file_func); + + if (blob_file_builder) { + if (status.ok()) { + status = blob_file_builder->Finish(); + } else { + blob_file_builder->Abandon(status); + } + blob_file_builder.reset(); + sub_compact->Current().UpdateBlobStats(); + } + + sub_compact->compaction_job_stats.cpu_micros = + db_options_.clock->CPUMicros() - prev_cpu_micros; + + if (measure_io_stats_) { + sub_compact->compaction_job_stats.file_write_nanos += + IOSTATS(write_nanos) - prev_write_nanos; + sub_compact->compaction_job_stats.file_fsync_nanos += + IOSTATS(fsync_nanos) - prev_fsync_nanos; + sub_compact->compaction_job_stats.file_range_sync_nanos += + IOSTATS(range_sync_nanos) - prev_range_sync_nanos; + sub_compact->compaction_job_stats.file_prepare_write_nanos += + IOSTATS(prepare_write_nanos) - prev_prepare_write_nanos; + sub_compact->compaction_job_stats.cpu_micros -= + (IOSTATS(cpu_write_nanos) - prev_cpu_write_nanos + + IOSTATS(cpu_read_nanos) - prev_cpu_read_nanos) / + 1000; + if (prev_perf_level != PerfLevel::kEnableTimeAndCPUTimeExceptForMutex) { + SetPerfLevel(prev_perf_level); + } + } +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + if (!status.ok()) { + if (c_iter) { + c_iter->status().PermitUncheckedError(); + } + if (input) { + input->status().PermitUncheckedError(); + } + } +#endif // ROCKSDB_ASSERT_STATUS_CHECKED + + blob_counter.reset(); + clip.reset(); + raw_input.reset(); + sub_compact->status = status; + NotifyOnSubcompactionCompleted(sub_compact); +} + +uint64_t CompactionJob::GetCompactionId(SubcompactionState* sub_compact) const { + return (uint64_t)job_id_ << 32 | sub_compact->sub_job_id; +} + +void CompactionJob::RecordDroppedKeys( + const CompactionIterationStats& c_iter_stats, + CompactionJobStats* compaction_job_stats) { + if (c_iter_stats.num_record_drop_user > 0) { + RecordTick(stats_, COMPACTION_KEY_DROP_USER, + c_iter_stats.num_record_drop_user); + } + if (c_iter_stats.num_record_drop_hidden > 0) { + RecordTick(stats_, COMPACTION_KEY_DROP_NEWER_ENTRY, + c_iter_stats.num_record_drop_hidden); + if (compaction_job_stats) { + compaction_job_stats->num_records_replaced += + c_iter_stats.num_record_drop_hidden; + } + } + if (c_iter_stats.num_record_drop_obsolete > 0) { + RecordTick(stats_, COMPACTION_KEY_DROP_OBSOLETE, + c_iter_stats.num_record_drop_obsolete); + if (compaction_job_stats) { + compaction_job_stats->num_expired_deletion_records += + c_iter_stats.num_record_drop_obsolete; + } + } + if (c_iter_stats.num_record_drop_range_del > 0) { + RecordTick(stats_, COMPACTION_KEY_DROP_RANGE_DEL, + c_iter_stats.num_record_drop_range_del); + } + if (c_iter_stats.num_range_del_drop_obsolete > 0) { + RecordTick(stats_, COMPACTION_RANGE_DEL_DROP_OBSOLETE, + c_iter_stats.num_range_del_drop_obsolete); + } + if (c_iter_stats.num_optimized_del_drop_obsolete > 0) { + RecordTick(stats_, COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE, + c_iter_stats.num_optimized_del_drop_obsolete); + } +} + +Status CompactionJob::FinishCompactionOutputFile( + const Status& input_status, SubcompactionState* sub_compact, + CompactionOutputs& outputs, const Slice& next_table_min_key) { + AutoThreadOperationStageUpdater stage_updater( + ThreadStatus::STAGE_COMPACTION_SYNC_FILE); + assert(sub_compact != nullptr); + assert(outputs.HasBuilder()); + + FileMetaData* meta = outputs.GetMetaData(); + uint64_t output_number = meta->fd.GetNumber(); + assert(output_number != 0); + + ColumnFamilyData* cfd = sub_compact->compaction->column_family_data(); + std::string file_checksum = 
kUnknownFileChecksum; + std::string file_checksum_func_name = kUnknownFileChecksumFuncName; + + // Check for iterator errors + Status s = input_status; + + // Add range tombstones + auto earliest_snapshot = kMaxSequenceNumber; + if (existing_snapshots_.size() > 0) { + earliest_snapshot = existing_snapshots_[0]; + } + if (s.ok()) { + CompactionIterationStats range_del_out_stats; + // if the compaction supports per_key_placement, only output range dels to + // the penultimate level. + // Note: Use `bottommost_level_ = true` for both bottommost and + // output_to_penultimate_level compaction here, as it's only used to decide + // if range dels could be dropped. + if (outputs.HasRangeDel()) { + s = outputs.AddRangeDels( + sub_compact->start.has_value() ? &(sub_compact->start.value()) + : nullptr, + sub_compact->end.has_value() ? &(sub_compact->end.value()) : nullptr, + range_del_out_stats, bottommost_level_, cfd->internal_comparator(), + earliest_snapshot, next_table_min_key, full_history_ts_low_); + } + RecordDroppedKeys(range_del_out_stats, &sub_compact->compaction_job_stats); + TEST_SYNC_POINT("CompactionJob::FinishCompactionOutputFile1"); + } + + const uint64_t current_entries = outputs.NumEntries(); + + s = outputs.Finish(s, seqno_time_mapping_); + + if (s.ok()) { + // With accurate smallest and largest key, we can get a slightly more + // accurate oldest ancester time. + // This makes oldest ancester time in manifest more accurate than in + // table properties. Not sure how to resolve it. + if (meta->smallest.size() > 0 && meta->largest.size() > 0) { + uint64_t refined_oldest_ancester_time; + Slice new_smallest = meta->smallest.user_key(); + Slice new_largest = meta->largest.user_key(); + if (!new_largest.empty() && !new_smallest.empty()) { + refined_oldest_ancester_time = + sub_compact->compaction->MinInputFileOldestAncesterTime( + &(meta->smallest), &(meta->largest)); + if (refined_oldest_ancester_time != + std::numeric_limits<uint64_t>::max()) { + meta->oldest_ancester_time = refined_oldest_ancester_time; + } + } + } + } + + // Finish and check for file errors + IOStatus io_s = outputs.WriterSyncClose(s, db_options_.clock, stats_, + db_options_.use_fsync); + + if (s.ok() && io_s.ok()) { + file_checksum = meta->file_checksum; + file_checksum_func_name = meta->file_checksum_func_name; + } + + if (s.ok()) { + s = io_s; + } + if (sub_compact->io_status.ok()) { + sub_compact->io_status = io_s; + // Since this error is really a copy of the + // "normal" status, it does not also need to be checked + sub_compact->io_status.PermitUncheckedError(); + } + + TableProperties tp; + if (s.ok()) { + tp = outputs.GetTableProperties(); + } + + if (s.ok() && current_entries == 0 && tp.num_range_deletions == 0) { + // If there is nothing to output, no necessary to generate a sst file. + // This happens when the output level is bottom level, at the same time + // the sub_compact output nothing. + std::string fname = + TableFileName(sub_compact->compaction->immutable_options()->cf_paths, + meta->fd.GetNumber(), meta->fd.GetPathId()); + + // TODO(AR) it is not clear if there are any larger implications if + // DeleteFile fails here + Status ds = env_->DeleteFile(fname); + if (!ds.ok()) { + ROCKS_LOG_WARN( + db_options_.info_log, + "[%s] [JOB %d] Unable to remove SST file for table #%" PRIu64 + " at bottom level%s", + cfd->GetName().c_str(), job_id_, output_number, + meta->marked_for_compaction ? 
" (need compaction)" : ""); + } + + // Also need to remove the file from outputs, or it will be added to the + // VersionEdit. + outputs.RemoveLastOutput(); + meta = nullptr; + } + + if (s.ok() && (current_entries > 0 || tp.num_range_deletions > 0)) { + // Output to event logger and fire events. + outputs.UpdateTableProperties(); + ROCKS_LOG_INFO(db_options_.info_log, + "[%s] [JOB %d] Generated table #%" PRIu64 ": %" PRIu64 + " keys, %" PRIu64 " bytes%s, temperature: %s", + cfd->GetName().c_str(), job_id_, output_number, + current_entries, meta->fd.file_size, + meta->marked_for_compaction ? " (need compaction)" : "", + temperature_to_string[meta->temperature].c_str()); + } + std::string fname; + FileDescriptor output_fd; + uint64_t oldest_blob_file_number = kInvalidBlobFileNumber; + Status status_for_listener = s; + if (meta != nullptr) { + fname = GetTableFileName(meta->fd.GetNumber()); + output_fd = meta->fd; + oldest_blob_file_number = meta->oldest_blob_file_number; + } else { + fname = "(nil)"; + if (s.ok()) { + status_for_listener = Status::Aborted("Empty SST file not kept"); + } + } + EventHelpers::LogAndNotifyTableFileCreationFinished( + event_logger_, cfd->ioptions()->listeners, dbname_, cfd->GetName(), fname, + job_id_, output_fd, oldest_blob_file_number, tp, + TableFileCreationReason::kCompaction, status_for_listener, file_checksum, + file_checksum_func_name); + +#ifndef ROCKSDB_LITE + // Report new file to SstFileManagerImpl + auto sfm = + static_cast<SstFileManagerImpl*>(db_options_.sst_file_manager.get()); + if (sfm && meta != nullptr && meta->fd.GetPathId() == 0) { + Status add_s = sfm->OnAddFile(fname); + if (!add_s.ok() && s.ok()) { + s = add_s; + } + if (sfm->IsMaxAllowedSpaceReached()) { + // TODO(ajkr): should we return OK() if max space was reached by the final + // compaction output file (similarly to how flush works when full)? + s = Status::SpaceLimit("Max allowed space was reached"); + TEST_SYNC_POINT( + "CompactionJob::FinishCompactionOutputFile:MaxAllowedSpaceReached"); + InstrumentedMutexLock l(db_mutex_); + db_error_handler_->SetBGError(s, BackgroundErrorReason::kCompaction); + } + } +#endif + + outputs.ResetBuilder(); + return s; +} + +Status CompactionJob::InstallCompactionResults( + const MutableCFOptions& mutable_cf_options) { + assert(compact_); + + db_mutex_->AssertHeld(); + + auto* compaction = compact_->compaction; + assert(compaction); + + { + Compaction::InputLevelSummaryBuffer inputs_summary; + if (compaction_stats_.has_penultimate_level_output) { + ROCKS_LOG_BUFFER( + log_buffer_, + "[%s] [JOB %d] Compacted %s => output_to_penultimate_level: %" PRIu64 + " bytes + last: %" PRIu64 " bytes. 
Total: %" PRIu64 " bytes", + compaction->column_family_data()->GetName().c_str(), job_id_, + compaction->InputLevelSummary(&inputs_summary), + compaction_stats_.penultimate_level_stats.bytes_written, + compaction_stats_.stats.bytes_written, + compaction_stats_.TotalBytesWritten()); + } else { + ROCKS_LOG_BUFFER(log_buffer_, + "[%s] [JOB %d] Compacted %s => %" PRIu64 " bytes", + compaction->column_family_data()->GetName().c_str(), + job_id_, compaction->InputLevelSummary(&inputs_summary), + compaction_stats_.TotalBytesWritten()); + } + } + + VersionEdit* const edit = compaction->edit(); + assert(edit); + + // Add compaction inputs + compaction->AddInputDeletions(edit); + + std::unordered_map<uint64_t, BlobGarbageMeter::BlobStats> blob_total_garbage; + + for (const auto& sub_compact : compact_->sub_compact_states) { + sub_compact.AddOutputsEdit(edit); + + for (const auto& blob : sub_compact.Current().GetBlobFileAdditions()) { + edit->AddBlobFile(blob); + } + + if (sub_compact.Current().GetBlobGarbageMeter()) { + const auto& flows = sub_compact.Current().GetBlobGarbageMeter()->flows(); + + for (const auto& pair : flows) { + const uint64_t blob_file_number = pair.first; + const BlobGarbageMeter::BlobInOutFlow& flow = pair.second; + + assert(flow.IsValid()); + if (flow.HasGarbage()) { + blob_total_garbage[blob_file_number].Add(flow.GetGarbageCount(), + flow.GetGarbageBytes()); + } + } + } + } + + for (const auto& pair : blob_total_garbage) { + const uint64_t blob_file_number = pair.first; + const BlobGarbageMeter::BlobStats& stats = pair.second; + + edit->AddBlobFileGarbage(blob_file_number, stats.GetCount(), + stats.GetBytes()); + } + + if ((compaction->compaction_reason() == + CompactionReason::kLevelMaxLevelSize || + compaction->compaction_reason() == CompactionReason::kRoundRobinTtl) && + compaction->immutable_options()->compaction_pri == kRoundRobin) { + int start_level = compaction->start_level(); + if (start_level > 0) { + auto vstorage = compaction->input_version()->storage_info(); + edit->AddCompactCursor(start_level, + vstorage->GetNextCompactCursor( + start_level, compaction->num_input_files(0))); + } + } + + return versions_->LogAndApply(compaction->column_family_data(), + mutable_cf_options, edit, db_mutex_, + db_directory_); +} + +void CompactionJob::RecordCompactionIOStats() { + RecordTick(stats_, COMPACT_READ_BYTES, IOSTATS(bytes_read)); + RecordTick(stats_, COMPACT_WRITE_BYTES, IOSTATS(bytes_written)); + CompactionReason compaction_reason = + compact_->compaction->compaction_reason(); + if (compaction_reason == CompactionReason::kFilesMarkedForCompaction) { + RecordTick(stats_, COMPACT_READ_BYTES_MARKED, IOSTATS(bytes_read)); + RecordTick(stats_, COMPACT_WRITE_BYTES_MARKED, IOSTATS(bytes_written)); + } else if (compaction_reason == CompactionReason::kPeriodicCompaction) { + RecordTick(stats_, COMPACT_READ_BYTES_PERIODIC, IOSTATS(bytes_read)); + RecordTick(stats_, COMPACT_WRITE_BYTES_PERIODIC, IOSTATS(bytes_written)); + } else if (compaction_reason == CompactionReason::kTtl) { + RecordTick(stats_, COMPACT_READ_BYTES_TTL, IOSTATS(bytes_read)); + RecordTick(stats_, COMPACT_WRITE_BYTES_TTL, IOSTATS(bytes_written)); + } + ThreadStatusUtil::IncreaseThreadOperationProperty( + ThreadStatus::COMPACTION_BYTES_READ, IOSTATS(bytes_read)); + IOSTATS_RESET(bytes_read); + ThreadStatusUtil::IncreaseThreadOperationProperty( + ThreadStatus::COMPACTION_BYTES_WRITTEN, IOSTATS(bytes_written)); + IOSTATS_RESET(bytes_written); +} + +Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* 
sub_compact, + CompactionOutputs& outputs) { + assert(sub_compact != nullptr); + + // no need to lock because VersionSet::next_file_number_ is atomic + uint64_t file_number = versions_->NewFileNumber(); + std::string fname = GetTableFileName(file_number); + // Fire events. + ColumnFamilyData* cfd = sub_compact->compaction->column_family_data(); +#ifndef ROCKSDB_LITE + EventHelpers::NotifyTableFileCreationStarted( + cfd->ioptions()->listeners, dbname_, cfd->GetName(), fname, job_id_, + TableFileCreationReason::kCompaction); +#endif // !ROCKSDB_LITE + // Make the output file + std::unique_ptr<FSWritableFile> writable_file; +#ifndef NDEBUG + bool syncpoint_arg = file_options_.use_direct_writes; + TEST_SYNC_POINT_CALLBACK("CompactionJob::OpenCompactionOutputFile", + &syncpoint_arg); +#endif + + // Pass temperature of the last level files to FileSystem. + FileOptions fo_copy = file_options_; + Temperature temperature = sub_compact->compaction->output_temperature(); + // only set for the last level compaction and also it's not output to + // penultimate level (when preclude_last_level feature is enabled) + if (temperature == Temperature::kUnknown && + sub_compact->compaction->is_last_level() && + !sub_compact->IsCurrentPenultimateLevel()) { + temperature = + sub_compact->compaction->mutable_cf_options()->last_level_temperature; + } + fo_copy.temperature = temperature; + + Status s; + IOStatus io_s = NewWritableFile(fs_.get(), fname, &writable_file, fo_copy); + s = io_s; + if (sub_compact->io_status.ok()) { + sub_compact->io_status = io_s; + // Since this error is really a copy of the io_s that is checked below as s, + // it does not also need to be checked. + sub_compact->io_status.PermitUncheckedError(); + } + if (!s.ok()) { + ROCKS_LOG_ERROR( + db_options_.info_log, + "[%s] [JOB %d] OpenCompactionOutputFiles for table #%" PRIu64 + " fails at NewWritableFile with status %s", + sub_compact->compaction->column_family_data()->GetName().c_str(), + job_id_, file_number, s.ToString().c_str()); + LogFlush(db_options_.info_log); + EventHelpers::LogAndNotifyTableFileCreationFinished( + event_logger_, cfd->ioptions()->listeners, dbname_, cfd->GetName(), + fname, job_id_, FileDescriptor(), kInvalidBlobFileNumber, + TableProperties(), TableFileCreationReason::kCompaction, s, + kUnknownFileChecksum, kUnknownFileChecksumFuncName); + return s; + } + + // Try to figure out the output file's oldest ancester time. + int64_t temp_current_time = 0; + auto get_time_status = db_options_.clock->GetCurrentTime(&temp_current_time); + // Safe to proceed even if GetCurrentTime fails. So, log and proceed. + if (!get_time_status.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Failed to get current time. Status: %s", + get_time_status.ToString().c_str()); + } + uint64_t current_time = static_cast<uint64_t>(temp_current_time); + InternalKey tmp_start, tmp_end; + if (sub_compact->start.has_value()) { + tmp_start.SetMinPossibleForUserKey(sub_compact->start.value()); + } + if (sub_compact->end.has_value()) { + tmp_end.SetMinPossibleForUserKey(sub_compact->end.value()); + } + uint64_t oldest_ancester_time = + sub_compact->compaction->MinInputFileOldestAncesterTime( + sub_compact->start.has_value() ? &tmp_start : nullptr, + sub_compact->end.has_value() ? 
&tmp_end : nullptr); + if (oldest_ancester_time == std::numeric_limits<uint64_t>::max()) { + oldest_ancester_time = current_time; + } + + // Initialize a SubcompactionState::Output and add it to sub_compact->outputs + { + FileMetaData meta; + meta.fd = FileDescriptor(file_number, + sub_compact->compaction->output_path_id(), 0); + meta.oldest_ancester_time = oldest_ancester_time; + meta.file_creation_time = current_time; + meta.temperature = temperature; + assert(!db_id_.empty()); + assert(!db_session_id_.empty()); + s = GetSstInternalUniqueId(db_id_, db_session_id_, meta.fd.GetNumber(), + &meta.unique_id); + if (!s.ok()) { + ROCKS_LOG_ERROR(db_options_.info_log, + "[%s] [JOB %d] file #%" PRIu64 + " failed to generate unique id: %s.", + cfd->GetName().c_str(), job_id_, meta.fd.GetNumber(), + s.ToString().c_str()); + return s; + } + + outputs.AddOutput(std::move(meta), cfd->internal_comparator(), + sub_compact->compaction->mutable_cf_options() + ->check_flush_compaction_key_order, + paranoid_file_checks_); + } + + writable_file->SetIOPriority(GetRateLimiterPriority()); + writable_file->SetWriteLifeTimeHint(write_hint_); + FileTypeSet tmp_set = db_options_.checksum_handoff_file_types; + writable_file->SetPreallocationBlockSize(static_cast<size_t>( + sub_compact->compaction->OutputFilePreallocationSize())); + const auto& listeners = + sub_compact->compaction->immutable_options()->listeners; + outputs.AssignFileWriter(new WritableFileWriter( + std::move(writable_file), fname, fo_copy, db_options_.clock, io_tracer_, + db_options_.stats, listeners, db_options_.file_checksum_gen_factory.get(), + tmp_set.Contains(FileType::kTableFile), false)); + + TableBuilderOptions tboptions( + *cfd->ioptions(), *(sub_compact->compaction->mutable_cf_options()), + cfd->internal_comparator(), cfd->int_tbl_prop_collector_factories(), + sub_compact->compaction->output_compression(), + sub_compact->compaction->output_compression_opts(), cfd->GetID(), + cfd->GetName(), sub_compact->compaction->output_level(), + bottommost_level_, TableFileCreationReason::kCompaction, + 0 /* oldest_key_time */, current_time, db_id_, db_session_id_, + sub_compact->compaction->max_output_file_size(), file_number); + + outputs.NewBuilder(tboptions); + + LogFlush(db_options_.info_log); + return s; +} + +void CompactionJob::CleanupCompaction() { + for (SubcompactionState& sub_compact : compact_->sub_compact_states) { + sub_compact.Cleanup(table_cache_.get()); + } + delete compact_; + compact_ = nullptr; +} + +#ifndef ROCKSDB_LITE +namespace { +void CopyPrefix(const Slice& src, size_t prefix_length, std::string* dst) { + assert(prefix_length > 0); + size_t length = src.size() > prefix_length ? 
prefix_length : src.size(); + dst->assign(src.data(), length); +} +} // namespace + +#endif // !ROCKSDB_LITE + +void CompactionJob::UpdateCompactionStats() { + assert(compact_); + + Compaction* compaction = compact_->compaction; + compaction_stats_.stats.num_input_files_in_non_output_levels = 0; + compaction_stats_.stats.num_input_files_in_output_level = 0; + for (int input_level = 0; + input_level < static_cast<int>(compaction->num_input_levels()); + ++input_level) { + if (compaction->level(input_level) != compaction->output_level()) { + UpdateCompactionInputStatsHelper( + &compaction_stats_.stats.num_input_files_in_non_output_levels, + &compaction_stats_.stats.bytes_read_non_output_levels, input_level); + } else { + UpdateCompactionInputStatsHelper( + &compaction_stats_.stats.num_input_files_in_output_level, + &compaction_stats_.stats.bytes_read_output_level, input_level); + } + } + + assert(compaction_job_stats_); + compaction_stats_.stats.bytes_read_blob = + compaction_job_stats_->total_blob_bytes_read; + + compaction_stats_.stats.num_dropped_records = + compaction_stats_.DroppedRecords(); +} + +void CompactionJob::UpdateCompactionInputStatsHelper(int* num_files, + uint64_t* bytes_read, + int input_level) { + const Compaction* compaction = compact_->compaction; + auto num_input_files = compaction->num_input_files(input_level); + *num_files += static_cast<int>(num_input_files); + + for (size_t i = 0; i < num_input_files; ++i) { + const auto* file_meta = compaction->input(input_level, i); + *bytes_read += file_meta->fd.GetFileSize(); + compaction_stats_.stats.num_input_records += + static_cast<uint64_t>(file_meta->num_entries); + } +} + +void CompactionJob::UpdateCompactionJobStats( + const InternalStats::CompactionStats& stats) const { +#ifndef ROCKSDB_LITE + compaction_job_stats_->elapsed_micros = stats.micros; + + // input information + compaction_job_stats_->total_input_bytes = + stats.bytes_read_non_output_levels + stats.bytes_read_output_level; + compaction_job_stats_->num_input_records = stats.num_input_records; + compaction_job_stats_->num_input_files = + stats.num_input_files_in_non_output_levels + + stats.num_input_files_in_output_level; + compaction_job_stats_->num_input_files_at_output_level = + stats.num_input_files_in_output_level; + + // output information + compaction_job_stats_->total_output_bytes = stats.bytes_written; + compaction_job_stats_->total_output_bytes_blob = stats.bytes_written_blob; + compaction_job_stats_->num_output_records = stats.num_output_records; + compaction_job_stats_->num_output_files = stats.num_output_files; + compaction_job_stats_->num_output_files_blob = stats.num_output_files_blob; + + if (stats.num_output_files > 0) { + CopyPrefix(compact_->SmallestUserKey(), + CompactionJobStats::kMaxPrefixLength, + &compaction_job_stats_->smallest_output_key_prefix); + CopyPrefix(compact_->LargestUserKey(), CompactionJobStats::kMaxPrefixLength, + &compaction_job_stats_->largest_output_key_prefix); + } +#else + (void)stats; +#endif // !ROCKSDB_LITE +} + +void CompactionJob::LogCompaction() { + Compaction* compaction = compact_->compaction; + ColumnFamilyData* cfd = compaction->column_family_data(); + + // Let's check if anything will get logged. 
Don't prepare all the info if + // we're not logging + if (db_options_.info_log_level <= InfoLogLevel::INFO_LEVEL) { + Compaction::InputLevelSummaryBuffer inputs_summary; + ROCKS_LOG_INFO( + db_options_.info_log, "[%s] [JOB %d] Compacting %s, score %.2f", + cfd->GetName().c_str(), job_id_, + compaction->InputLevelSummary(&inputs_summary), compaction->score()); + char scratch[2345]; + compaction->Summary(scratch, sizeof(scratch)); + ROCKS_LOG_INFO(db_options_.info_log, "[%s]: Compaction start summary: %s\n", + cfd->GetName().c_str(), scratch); + // build event logger report + auto stream = event_logger_->Log(); + stream << "job" << job_id_ << "event" + << "compaction_started" + << "compaction_reason" + << GetCompactionReasonString(compaction->compaction_reason()); + for (size_t i = 0; i < compaction->num_input_levels(); ++i) { + stream << ("files_L" + std::to_string(compaction->level(i))); + stream.StartArray(); + for (auto f : *compaction->inputs(i)) { + stream << f->fd.GetNumber(); + } + stream.EndArray(); + } + stream << "score" << compaction->score() << "input_data_size" + << compaction->CalculateTotalInputSize() << "oldest_snapshot_seqno" + << (existing_snapshots_.empty() + ? int64_t{-1} // Use -1 for "none" + : static_cast<int64_t>(existing_snapshots_[0])); + if (compaction->SupportsPerKeyPlacement()) { + stream << "preclude_last_level_min_seqno" + << preclude_last_level_min_seqno_; + stream << "penultimate_output_level" << compaction->GetPenultimateLevel(); + stream << "penultimate_output_range" + << GetCompactionPenultimateOutputRangeTypeString( + compaction->GetPenultimateOutputRangeType()); + + if (compaction->GetPenultimateOutputRangeType() == + Compaction::PenultimateOutputRangeType::kDisabled) { + ROCKS_LOG_WARN( + db_options_.info_log, + "[%s] [JOB %d] Penultimate level output is disabled, likely " + "because of the range conflict in the penultimate level", + cfd->GetName().c_str(), job_id_); + } + } + } +} + +std::string CompactionJob::GetTableFileName(uint64_t file_number) { + return TableFileName(compact_->compaction->immutable_options()->cf_paths, + file_number, compact_->compaction->output_path_id()); +} + +Env::IOPriority CompactionJob::GetRateLimiterPriority() { + if (versions_ && versions_->GetColumnFamilySet() && + versions_->GetColumnFamilySet()->write_controller()) { + WriteController* write_controller = + versions_->GetColumnFamilySet()->write_controller(); + if (write_controller->NeedsDelay() || write_controller->IsStopped()) { + return Env::IO_USER; + } + } + + return Env::IO_LOW; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/compaction/compaction_job.h b/src/rocksdb/db/compaction/compaction_job.h new file mode 100644 index 000000000..bfbce1011 --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_job.h @@ -0,0 +1,500 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
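// Standalone restatement of the decision in GetRateLimiterPriority() above
// (simplified types; not the real WriteController or Env interfaces):
// compaction I/O normally runs at low priority, but is promoted to the user
// class while foreground writes are being delayed or stopped, since making
// compaction progress is what clears such a stall.
enum class IoPriority { kLow, kUser };

IoPriority PickCompactionIoPriority(bool write_needs_delay,
                                    bool write_stopped) {
  return (write_needs_delay || write_stopped) ? IoPriority::kUser
                                              : IoPriority::kLow;
}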
+#pragma once + +#include <atomic> +#include <deque> +#include <functional> +#include <limits> +#include <set> +#include <string> +#include <utility> +#include <vector> + +#include "db/blob/blob_file_completion_callback.h" +#include "db/column_family.h" +#include "db/compaction/compaction_iterator.h" +#include "db/compaction/compaction_outputs.h" +#include "db/flush_scheduler.h" +#include "db/internal_stats.h" +#include "db/job_context.h" +#include "db/log_writer.h" +#include "db/memtable_list.h" +#include "db/range_del_aggregator.h" +#include "db/seqno_to_time_mapping.h" +#include "db/version_edit.h" +#include "db/write_controller.h" +#include "db/write_thread.h" +#include "logging/event_logger.h" +#include "options/cf_options.h" +#include "options/db_options.h" +#include "port/port.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/compaction_job_stats.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/memtablerep.h" +#include "rocksdb/transaction_log.h" +#include "table/scoped_arena_iterator.h" +#include "util/autovector.h" +#include "util/stop_watch.h" +#include "util/thread_local.h" + +namespace ROCKSDB_NAMESPACE { + +class Arena; +class CompactionState; +class ErrorHandler; +class MemTable; +class SnapshotChecker; +class SystemClock; +class TableCache; +class Version; +class VersionEdit; +class VersionSet; + +class SubcompactionState; + +// CompactionJob is responsible for executing the compaction. Each (manual or +// automated) compaction corresponds to a CompactionJob object, and usually +// goes through the stages of `Prepare()`->`Run()`->`Install()`. CompactionJob +// will divide the compaction into subcompactions and execute them in parallel +// if needed. +// +// CompactionJob has 2 main stats: +// 1. CompactionJobStats compaction_job_stats_ +// CompactionJobStats is a public data structure which is part of Compaction +// event listener that rocksdb share the job stats with the user. +// Internally it's an aggregation of all the compaction_job_stats from each +// `SubcompactionState`: +// +------------------------+ +// | SubcompactionState | +// | | +// +--------->| compaction_job_stats | +// | | | +// | +------------------------+ +// +------------------------+ | +// | CompactionJob | | +------------------------+ +// | | | | SubcompactionState | +// | compaction_job_stats +-----+ | | +// | | +--------->| compaction_job_stats | +// | | | | | +// +------------------------+ | +------------------------+ +// | +// | +------------------------+ +// | | SubcompactionState | +// | | | +// +--------->+ compaction_job_stats | +// | | | +// | +------------------------+ +// | +// | +------------------------+ +// | | ... | +// +--------->+ | +// +------------------------+ +// +// 2. CompactionStatsFull compaction_stats_ +// `CompactionStatsFull` is an internal stats about the compaction, which +// is eventually sent to `ColumnFamilyData::internal_stats_` and used for +// logging and public metrics. +// Internally, it's an aggregation of stats_ from each `SubcompactionState`. +// It has 2 parts, normal stats about the main compaction information and +// the penultimate level output stats. +// `SubcompactionState` maintains the CompactionOutputs for normal output and +// the penultimate level output if exists, the per_level stats is +// stored with the outputs. 
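// A reduced sketch of the aggregation relationship described above (plain
// structs with made-up fields, not the real CompactionJobStats or
// InternalStats types): every subcompaction owns its own counters and the
// job-level numbers are simply their sum. The diagram that follows shows the
// same shape for CompactionStatsFull and its normal/penultimate-level outputs.
#include <cstdint>
#include <vector>

struct SubcompactionCounters {
  uint64_t bytes_written = 0;
  uint64_t num_output_files = 0;
};

SubcompactionCounters SumForJob(
    const std::vector<SubcompactionCounters>& subs) {
  SubcompactionCounters total;
  for (const auto& s : subs) {
    total.bytes_written += s.bytes_written;
    total.num_output_files += s.num_output_files;
  }
  return total;
}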
+// +---------------------------+ +// | SubcompactionState | +// | | +// | +----------------------+ | +// | | CompactionOutputs | | +// | | (normal output) | | +// +---->| stats_ | | +// | | +----------------------+ | +// | | | +// | | +----------------------+ | +// +--------------------------------+ | | | CompactionOutputs | | +// | CompactionJob | | | | (penultimate_level) | | +// | | +--------->| stats_ | | +// | compaction_stats_ | | | | +----------------------+ | +// | +-------------------------+ | | | | | +// | |stats (normal) |------|----+ +---------------------------+ +// | +-------------------------+ | | | +// | | | | +// | +-------------------------+ | | | +---------------------------+ +// | |penultimate_level_stats +------+ | | SubcompactionState | +// | +-------------------------+ | | | | | +// | | | | | +----------------------+ | +// | | | | | | CompactionOutputs | | +// +--------------------------------+ | | | | (normal output) | | +// | +---->| stats_ | | +// | | +----------------------+ | +// | | | +// | | +----------------------+ | +// | | | CompactionOutputs | | +// | | | (penultimate_level) | | +// +--------->| stats_ | | +// | +----------------------+ | +// | | +// +---------------------------+ + +class CompactionJob { + public: + CompactionJob( + int job_id, Compaction* compaction, const ImmutableDBOptions& db_options, + const MutableDBOptions& mutable_db_options, + const FileOptions& file_options, VersionSet* versions, + const std::atomic<bool>* shutting_down, LogBuffer* log_buffer, + FSDirectory* db_directory, FSDirectory* output_directory, + FSDirectory* blob_output_directory, Statistics* stats, + InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler, + std::vector<SequenceNumber> existing_snapshots, + SequenceNumber earliest_write_conflict_snapshot, + const SnapshotChecker* snapshot_checker, JobContext* job_context, + std::shared_ptr<Cache> table_cache, EventLogger* event_logger, + bool paranoid_file_checks, bool measure_io_stats, + const std::string& dbname, CompactionJobStats* compaction_job_stats, + Env::Priority thread_pri, const std::shared_ptr<IOTracer>& io_tracer, + const std::atomic<bool>& manual_compaction_canceled, + const std::string& db_id = "", const std::string& db_session_id = "", + std::string full_history_ts_low = "", std::string trim_ts = "", + BlobFileCompletionCallback* blob_callback = nullptr, + int* bg_compaction_scheduled = nullptr, + int* bg_bottom_compaction_scheduled = nullptr); + + virtual ~CompactionJob(); + + // no copy/move + CompactionJob(CompactionJob&& job) = delete; + CompactionJob(const CompactionJob& job) = delete; + CompactionJob& operator=(const CompactionJob& job) = delete; + + // REQUIRED: mutex held + // Prepare for the compaction by setting up boundaries for each subcompaction + void Prepare(); + // REQUIRED mutex not held + // Launch threads for each subcompaction and wait for them to finish. After + // that, verify table is usable and finally do bookkeeping to unify + // subcompaction results + Status Run(); + + // REQUIRED: mutex held + // Add compaction input/output to the current version + Status Install(const MutableCFOptions& mutable_cf_options); + + // Return the IO status + IOStatus io_status() const { return io_status_; } + + protected: + void UpdateCompactionStats(); + void LogCompaction(); + virtual void RecordCompactionIOStats(); + void CleanupCompaction(); + + // Call compaction filter. 
Then iterate through input and compact the + // kv-pairs + void ProcessKeyValueCompaction(SubcompactionState* sub_compact); + + CompactionState* compact_; + InternalStats::CompactionStatsFull compaction_stats_; + const ImmutableDBOptions& db_options_; + const MutableDBOptions mutable_db_options_copy_; + LogBuffer* log_buffer_; + FSDirectory* output_directory_; + Statistics* stats_; + // Is this compaction creating a file in the bottom most level? + bool bottommost_level_; + + Env::WriteLifeTimeHint write_hint_; + + IOStatus io_status_; + + CompactionJobStats* compaction_job_stats_; + + private: + friend class CompactionJobTestBase; + + // Generates a histogram representing potential divisions of key ranges from + // the input. It adds the starting and/or ending keys of certain input files + // to the working set and then finds the approximate size of data in between + // each consecutive pair of slices. Then it divides these ranges into + // consecutive groups such that each group has a similar size. + void GenSubcompactionBoundaries(); + + // Get the number of planned subcompactions based on max_subcompactions and + // extra reserved resources + uint64_t GetSubcompactionsLimit(); + + // Additional reserved threads are reserved and the number is stored in + // extra_num_subcompaction_threads_reserved__. For now, this happens only if + // the compaction priority is round-robin and max_subcompactions is not + // sufficient (extra resources may be needed) + void AcquireSubcompactionResources(int num_extra_required_subcompactions); + + // Additional threads may be reserved during IncreaseSubcompactionResources() + // if num_actual_subcompactions is less than num_planned_subcompactions. + // Additional threads will be released and the bg_compaction_scheduled_ or + // bg_bottom_compaction_scheduled_ will be updated if they are used. + // DB Mutex lock is required. + void ShrinkSubcompactionResources(uint64_t num_extra_resources); + + // Release all reserved threads and update the compaction limits. + void ReleaseSubcompactionResources(); + + CompactionServiceJobStatus ProcessKeyValueCompactionWithCompactionService( + SubcompactionState* sub_compact); + + // update the thread status for starting a compaction. 
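  // Illustrative sketch of the grouping idea described for
  // GenSubcompactionBoundaries() above (hypothetical helper; the real code
  // works on input-file boundary keys and approximate sizes from the version):
  // given the approximate data size between consecutive candidate keys, cut a
  // new group whenever the running size reaches roughly total / num_groups, so
  // the resulting subcompaction ranges end up similar in size.
  #include <cstddef>
  #include <cstdint>
  #include <vector>

  std::vector<size_t> PickSplitPoints(const std::vector<uint64_t>& range_sizes,
                                      uint64_t num_groups) {
    std::vector<size_t> splits;
    if (num_groups <= 1 || range_sizes.empty()) {
      return splits;  // everything stays in a single group
    }
    uint64_t total = 0;
    for (uint64_t sz : range_sizes) {
      total += sz;
    }
    const uint64_t target = total / num_groups;
    uint64_t running = 0;
    for (size_t i = 0; i + 1 < range_sizes.size(); ++i) {
      running += range_sizes[i];
      if (running >= target && splits.size() + 1 < num_groups) {
        splits.push_back(i);  // split after the i-th candidate range
        running = 0;
      }
    }
    return splits;
  }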
+ void ReportStartedCompaction(Compaction* compaction); + + Status FinishCompactionOutputFile(const Status& input_status, + SubcompactionState* sub_compact, + CompactionOutputs& outputs, + const Slice& next_table_min_key); + Status InstallCompactionResults(const MutableCFOptions& mutable_cf_options); + Status OpenCompactionOutputFile(SubcompactionState* sub_compact, + CompactionOutputs& outputs); + void UpdateCompactionJobStats( + const InternalStats::CompactionStats& stats) const; + void RecordDroppedKeys(const CompactionIterationStats& c_iter_stats, + CompactionJobStats* compaction_job_stats = nullptr); + + void UpdateCompactionInputStatsHelper(int* num_files, uint64_t* bytes_read, + int input_level); + + void NotifyOnSubcompactionBegin(SubcompactionState* sub_compact); + + void NotifyOnSubcompactionCompleted(SubcompactionState* sub_compact); + + uint32_t job_id_; + + // DBImpl state + const std::string& dbname_; + const std::string db_id_; + const std::string db_session_id_; + const FileOptions file_options_; + + Env* env_; + std::shared_ptr<IOTracer> io_tracer_; + FileSystemPtr fs_; + // env_option optimized for compaction table reads + FileOptions file_options_for_read_; + VersionSet* versions_; + const std::atomic<bool>* shutting_down_; + const std::atomic<bool>& manual_compaction_canceled_; + FSDirectory* db_directory_; + FSDirectory* blob_output_directory_; + InstrumentedMutex* db_mutex_; + ErrorHandler* db_error_handler_; + // If there were two snapshots with seq numbers s1 and + // s2 and s1 < s2, and if we find two instances of a key k1 then lies + // entirely within s1 and s2, then the earlier version of k1 can be safely + // deleted because that version is not visible in any snapshot. + std::vector<SequenceNumber> existing_snapshots_; + + // This is the earliest snapshot that could be used for write-conflict + // checking by a transaction. For any user-key newer than this snapshot, we + // should make sure not to remove evidence that a write occurred. + SequenceNumber earliest_write_conflict_snapshot_; + + const SnapshotChecker* const snapshot_checker_; + + JobContext* job_context_; + + std::shared_ptr<Cache> table_cache_; + + EventLogger* event_logger_; + + bool paranoid_file_checks_; + bool measure_io_stats_; + // Stores the Slices that designate the boundaries for each subcompaction + std::vector<std::string> boundaries_; + Env::Priority thread_pri_; + std::string full_history_ts_low_; + std::string trim_ts_; + BlobFileCompletionCallback* blob_callback_; + + uint64_t GetCompactionId(SubcompactionState* sub_compact) const; + // Stores the number of reserved threads in shared env_ for the number of + // extra subcompaction in kRoundRobin compaction priority + int extra_num_subcompaction_threads_reserved_; + + // Stores the pointer to bg_compaction_scheduled_, + // bg_bottom_compaction_scheduled_ in DBImpl. Mutex is required when accessing + // or updating it. + int* bg_compaction_scheduled_; + int* bg_bottom_compaction_scheduled_; + + // Stores the sequence number to time mapping gathered from all input files + // it also collects the smallest_seqno -> oldest_ancester_time from the SST. + SeqnoToTimeMapping seqno_time_mapping_; + + // Minimal sequence number for preserving the time information. The time info + // older than this sequence number won't be preserved after the compaction and + // if it's bottommost compaction, the seq num will be zeroed out. 
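  // Illustrative restatement of the comment above (not necessarily the exact
  // check the compaction iterator performs): sequence-number-to-time
  // information is kept only for keys at or above preserve_time_min_seqno_;
  // anything older falls outside the preserved window.
  inline bool PreserveTimeInfo(uint64_t key_seqno,
                               uint64_t preserve_time_min_seqno) {
    return key_seqno >= preserve_time_min_seqno;
  }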
+ SequenceNumber preserve_time_min_seqno_ = kMaxSequenceNumber; + + // Minimal sequence number to preclude the data from the last level. If the + // key has bigger (newer) sequence number than this, it will be precluded from + // the last level (output to penultimate level). + SequenceNumber preclude_last_level_min_seqno_ = kMaxSequenceNumber; + + // Get table file name in where it's outputting to, which should also be in + // `output_directory_`. + virtual std::string GetTableFileName(uint64_t file_number); + // The rate limiter priority (io_priority) is determined dynamically here. + // The Compaction Read and Write priorities are the same for different + // scenarios, such as write stalled. + Env::IOPriority GetRateLimiterPriority(); +}; + +// CompactionServiceInput is used the pass compaction information between two +// db instances. It contains the information needed to do a compaction. It +// doesn't contain the LSM tree information, which is passed though MANIFEST +// file. +struct CompactionServiceInput { + ColumnFamilyDescriptor column_family; + + DBOptions db_options; + + std::vector<SequenceNumber> snapshots; + + // SST files for compaction, it should already be expended to include all the + // files needed for this compaction, for both input level files and output + // level files. + std::vector<std::string> input_files; + int output_level; + + // db_id is used to generate unique id of sst on the remote compactor + std::string db_id; + + // information for subcompaction + bool has_begin = false; + std::string begin; + bool has_end = false; + std::string end; + + // serialization interface to read and write the object + static Status Read(const std::string& data_str, CompactionServiceInput* obj); + Status Write(std::string* output); + + // Initialize a dummy ColumnFamilyDescriptor + CompactionServiceInput() : column_family("", ColumnFamilyOptions()) {} + +#ifndef NDEBUG + bool TEST_Equals(CompactionServiceInput* other); + bool TEST_Equals(CompactionServiceInput* other, std::string* mismatch); +#endif // NDEBUG +}; + +// CompactionServiceOutputFile is the metadata for the output SST file +struct CompactionServiceOutputFile { + std::string file_name; + SequenceNumber smallest_seqno; + SequenceNumber largest_seqno; + std::string smallest_internal_key; + std::string largest_internal_key; + uint64_t oldest_ancester_time; + uint64_t file_creation_time; + uint64_t paranoid_hash; + bool marked_for_compaction; + UniqueId64x2 unique_id; + + CompactionServiceOutputFile() = default; + CompactionServiceOutputFile( + const std::string& name, SequenceNumber smallest, SequenceNumber largest, + std::string _smallest_internal_key, std::string _largest_internal_key, + uint64_t _oldest_ancester_time, uint64_t _file_creation_time, + uint64_t _paranoid_hash, bool _marked_for_compaction, + UniqueId64x2 _unique_id) + : file_name(name), + smallest_seqno(smallest), + largest_seqno(largest), + smallest_internal_key(std::move(_smallest_internal_key)), + largest_internal_key(std::move(_largest_internal_key)), + oldest_ancester_time(_oldest_ancester_time), + file_creation_time(_file_creation_time), + paranoid_hash(_paranoid_hash), + marked_for_compaction(_marked_for_compaction), + unique_id(std::move(_unique_id)) {} +}; + +// CompactionServiceResult contains the compaction result from a different db +// instance, with these information, the primary db instance with write +// permission is able to install the result to the DB. 
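// Hedged usage sketch of the Write()/Read() serialization interface declared
// just above on CompactionServiceInput (this helper function itself is made
// up): the primary instance flattens the job description with Write(), and
// the remote compactor reconstructs it with Read(). The CompactionServiceResult
// struct that follows exposes the same pair for the reply.
inline Status RoundTripCompactionServiceInputExample() {
  CompactionServiceInput input;
  input.output_level = 2;
  input.input_files = {"000012.sst", "000034.sst"};  // example file names

  std::string payload;
  Status s = input.Write(&payload);  // serialize for the remote side
  if (!s.ok()) {
    return s;
  }

  CompactionServiceInput decoded;
  return CompactionServiceInput::Read(payload, &decoded);  // reconstruct
}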
+struct CompactionServiceResult { + Status status; + std::vector<CompactionServiceOutputFile> output_files; + int output_level; + + // location of the output files + std::string output_path; + + // some statistics about the compaction + uint64_t num_output_records = 0; + uint64_t total_bytes = 0; + uint64_t bytes_read = 0; + uint64_t bytes_written = 0; + CompactionJobStats stats; + + // serialization interface to read and write the object + static Status Read(const std::string& data_str, CompactionServiceResult* obj); + Status Write(std::string* output); + +#ifndef NDEBUG + bool TEST_Equals(CompactionServiceResult* other); + bool TEST_Equals(CompactionServiceResult* other, std::string* mismatch); +#endif // NDEBUG +}; + +// CompactionServiceCompactionJob is an read-only compaction job, it takes +// input information from `compaction_service_input` and put result information +// in `compaction_service_result`, the SST files are generated to `output_path`. +class CompactionServiceCompactionJob : private CompactionJob { + public: + CompactionServiceCompactionJob( + int job_id, Compaction* compaction, const ImmutableDBOptions& db_options, + const MutableDBOptions& mutable_db_options, + const FileOptions& file_options, VersionSet* versions, + const std::atomic<bool>* shutting_down, LogBuffer* log_buffer, + FSDirectory* output_directory, Statistics* stats, + InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler, + std::vector<SequenceNumber> existing_snapshots, + std::shared_ptr<Cache> table_cache, EventLogger* event_logger, + const std::string& dbname, const std::shared_ptr<IOTracer>& io_tracer, + const std::atomic<bool>& manual_compaction_canceled, + const std::string& db_id, const std::string& db_session_id, + std::string output_path, + const CompactionServiceInput& compaction_service_input, + CompactionServiceResult* compaction_service_result); + + // Run the compaction in current thread and return the result + Status Run(); + + void CleanupCompaction(); + + IOStatus io_status() const { return CompactionJob::io_status(); } + + protected: + void RecordCompactionIOStats() override; + + private: + // Get table file name in output_path + std::string GetTableFileName(uint64_t file_number) override; + // Specific the compaction output path, otherwise it uses default DB path + const std::string output_path_; + + // Compaction job input + const CompactionServiceInput& compaction_input_; + + // Compaction job result + CompactionServiceResult* compaction_result_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/compaction/compaction_job_stats_test.cc b/src/rocksdb/db/compaction/compaction_job_stats_test.cc new file mode 100644 index 000000000..930270778 --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_job_stats_test.cc @@ -0,0 +1,975 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include <algorithm> +#include <cinttypes> +#include <iostream> +#include <mutex> +#include <queue> +#include <set> +#include <thread> +#include <unordered_set> +#include <utility> + +#include "db/db_impl/db_impl.h" +#include "db/dbformat.h" +#include "db/job_context.h" +#include "db/version_set.h" +#include "db/write_batch_internal.h" +#include "env/mock_env.h" +#include "file/filename.h" +#include "monitoring/statistics.h" +#include "monitoring/thread_status_util.h" +#include "port/stack_trace.h" +#include "rocksdb/cache.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/convenience.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/experimental.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/options.h" +#include "rocksdb/perf_context.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/table.h" +#include "rocksdb/table_properties.h" +#include "rocksdb/thread_status.h" +#include "rocksdb/utilities/checkpoint.h" +#include "rocksdb/utilities/write_batch_with_index.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/mock_table.h" +#include "table/plain/plain_table_factory.h" +#include "table/scoped_arena_iterator.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/cast_util.h" +#include "util/compression.h" +#include "util/hash.h" +#include "util/mutexlock.h" +#include "util/rate_limiter.h" +#include "util/string_util.h" +#include "utilities/merge_operators.h" + +#if !defined(IOS_CROSS_COMPILE) +#ifndef ROCKSDB_LITE +namespace ROCKSDB_NAMESPACE { + +static std::string RandomString(Random* rnd, int len, double ratio) { + std::string r; + test::CompressibleString(rnd, ratio, len, &r); + return r; +} + +std::string Key(uint64_t key, int length) { + const int kBufSize = 1000; + char buf[kBufSize]; + if (length > kBufSize) { + length = kBufSize; + } + snprintf(buf, kBufSize, "%0*" PRIu64, length, key); + return std::string(buf); +} + +class CompactionJobStatsTest : public testing::Test, + public testing::WithParamInterface<bool> { + public: + std::string dbname_; + std::string alternative_wal_dir_; + Env* env_; + DB* db_; + std::vector<ColumnFamilyHandle*> handles_; + uint32_t max_subcompactions_; + + Options last_options_; + + CompactionJobStatsTest() : env_(Env::Default()) { + env_->SetBackgroundThreads(1, Env::LOW); + env_->SetBackgroundThreads(1, Env::HIGH); + dbname_ = test::PerThreadDBPath("compaction_job_stats_test"); + alternative_wal_dir_ = dbname_ + "/wal"; + Options options; + options.create_if_missing = true; + max_subcompactions_ = GetParam(); + options.max_subcompactions = max_subcompactions_; + auto delete_options = options; + delete_options.wal_dir = alternative_wal_dir_; + EXPECT_OK(DestroyDB(dbname_, delete_options)); + // Destroy it for not alternative WAL dir is used. 
+ EXPECT_OK(DestroyDB(dbname_, options)); + db_ = nullptr; + Reopen(options); + } + + ~CompactionJobStatsTest() override { + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + Close(); + Options options; + options.db_paths.emplace_back(dbname_, 0); + options.db_paths.emplace_back(dbname_ + "_2", 0); + options.db_paths.emplace_back(dbname_ + "_3", 0); + options.db_paths.emplace_back(dbname_ + "_4", 0); + EXPECT_OK(DestroyDB(dbname_, options)); + } + + // Required if inheriting from testing::WithParamInterface<> + static void SetUpTestCase() {} + static void TearDownTestCase() {} + + DBImpl* dbfull() { return static_cast_with_check<DBImpl>(db_); } + + void CreateColumnFamilies(const std::vector<std::string>& cfs, + const Options& options) { + ColumnFamilyOptions cf_opts(options); + size_t cfi = handles_.size(); + handles_.resize(cfi + cfs.size()); + for (auto cf : cfs) { + ASSERT_OK(db_->CreateColumnFamily(cf_opts, cf, &handles_[cfi++])); + } + } + + void CreateAndReopenWithCF(const std::vector<std::string>& cfs, + const Options& options) { + CreateColumnFamilies(cfs, options); + std::vector<std::string> cfs_plus_default = cfs; + cfs_plus_default.insert(cfs_plus_default.begin(), kDefaultColumnFamilyName); + ReopenWithColumnFamilies(cfs_plus_default, options); + } + + void ReopenWithColumnFamilies(const std::vector<std::string>& cfs, + const std::vector<Options>& options) { + ASSERT_OK(TryReopenWithColumnFamilies(cfs, options)); + } + + void ReopenWithColumnFamilies(const std::vector<std::string>& cfs, + const Options& options) { + ASSERT_OK(TryReopenWithColumnFamilies(cfs, options)); + } + + Status TryReopenWithColumnFamilies(const std::vector<std::string>& cfs, + const std::vector<Options>& options) { + Close(); + EXPECT_EQ(cfs.size(), options.size()); + std::vector<ColumnFamilyDescriptor> column_families; + for (size_t i = 0; i < cfs.size(); ++i) { + column_families.push_back(ColumnFamilyDescriptor(cfs[i], options[i])); + } + DBOptions db_opts = DBOptions(options[0]); + return DB::Open(db_opts, dbname_, column_families, &handles_, &db_); + } + + Status TryReopenWithColumnFamilies(const std::vector<std::string>& cfs, + const Options& options) { + Close(); + std::vector<Options> v_opts(cfs.size(), options); + return TryReopenWithColumnFamilies(cfs, v_opts); + } + + void Reopen(const Options& options) { ASSERT_OK(TryReopen(options)); } + + void Close() { + for (auto h : handles_) { + delete h; + } + handles_.clear(); + delete db_; + db_ = nullptr; + } + + void DestroyAndReopen(const Options& options) { + // Destroy using last options + Destroy(last_options_); + ASSERT_OK(TryReopen(options)); + } + + void Destroy(const Options& options) { + Close(); + ASSERT_OK(DestroyDB(dbname_, options)); + } + + Status ReadOnlyReopen(const Options& options) { + return DB::OpenForReadOnly(options, dbname_, &db_); + } + + Status TryReopen(const Options& options) { + Close(); + last_options_ = options; + return DB::Open(options, dbname_, &db_); + } + + Status Flush(int cf = 0) { + if (cf == 0) { + return db_->Flush(FlushOptions()); + } else { + return db_->Flush(FlushOptions(), handles_[cf]); + } + } + + Status Put(const Slice& k, const Slice& v, WriteOptions wo = WriteOptions()) { + return db_->Put(wo, k, v); + } + + Status Put(int cf, const Slice& k, const Slice& v, + WriteOptions wo = WriteOptions()) { + return db_->Put(wo, handles_[cf], k, v); + } + + 
Status Delete(const std::string& k) { return db_->Delete(WriteOptions(), k); } + + Status Delete(int cf, const std::string& k) { + return db_->Delete(WriteOptions(), handles_[cf], k); + } + + std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) { + ReadOptions options; + options.verify_checksums = true; + options.snapshot = snapshot; + std::string result; + Status s = db_->Get(options, k, &result); + if (s.IsNotFound()) { + result = "NOT_FOUND"; + } else if (!s.ok()) { + result = s.ToString(); + } + return result; + } + + std::string Get(int cf, const std::string& k, + const Snapshot* snapshot = nullptr) { + ReadOptions options; + options.verify_checksums = true; + options.snapshot = snapshot; + std::string result; + Status s = db_->Get(options, handles_[cf], k, &result); + if (s.IsNotFound()) { + result = "NOT_FOUND"; + } else if (!s.ok()) { + result = s.ToString(); + } + return result; + } + + int NumTableFilesAtLevel(int level, int cf = 0) { + std::string property; + if (cf == 0) { + // default cfd + EXPECT_TRUE(db_->GetProperty( + "rocksdb.num-files-at-level" + std::to_string(level), &property)); + } else { + EXPECT_TRUE(db_->GetProperty( + handles_[cf], "rocksdb.num-files-at-level" + std::to_string(level), + &property)); + } + return atoi(property.c_str()); + } + + // Return spread of files per level + std::string FilesPerLevel(int cf = 0) { + int num_levels = + (cf == 0) ? db_->NumberLevels() : db_->NumberLevels(handles_[1]); + std::string result; + size_t last_non_zero_offset = 0; + for (int level = 0; level < num_levels; level++) { + int f = NumTableFilesAtLevel(level, cf); + char buf[100]; + snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f); + result += buf; + if (f > 0) { + last_non_zero_offset = result.size(); + } + } + result.resize(last_non_zero_offset); + return result; + } + + Status Size(uint64_t* size, const Slice& start, const Slice& limit, + int cf = 0) { + Range r(start, limit); + if (cf == 0) { + return db_->GetApproximateSizes(&r, 1, size); + } else { + return db_->GetApproximateSizes(handles_[1], &r, 1, size); + } + } + + void Compact(int cf, const Slice& start, const Slice& limit, + uint32_t target_path_id) { + CompactRangeOptions compact_options; + compact_options.target_path_id = target_path_id; + ASSERT_OK(db_->CompactRange(compact_options, handles_[cf], &start, &limit)); + } + + void Compact(int cf, const Slice& start, const Slice& limit) { + ASSERT_OK( + db_->CompactRange(CompactRangeOptions(), handles_[cf], &start, &limit)); + } + + void Compact(const Slice& start, const Slice& limit) { + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &limit)); + } + + void TEST_Compact(int level, int cf, const Slice& start, const Slice& limit) { + ASSERT_OK(dbfull()->TEST_CompactRange(level, &start, &limit, handles_[cf], + true /* disallow trivial move */)); + } + + // Do n memtable compactions, each of which produces an sstable + // covering the range [small,large]. 
+ void MakeTables(int n, const std::string& small, const std::string& large, + int cf = 0) { + for (int i = 0; i < n; i++) { + ASSERT_OK(Put(cf, small, "begin")); + ASSERT_OK(Put(cf, large, "end")); + ASSERT_OK(Flush(cf)); + } + } + + static void SetDeletionCompactionStats(CompactionJobStats* stats, + uint64_t input_deletions, + uint64_t expired_deletions, + uint64_t records_replaced) { + stats->num_input_deletion_records = input_deletions; + stats->num_expired_deletion_records = expired_deletions; + stats->num_records_replaced = records_replaced; + } + + void MakeTableWithKeyValues(Random* rnd, uint64_t smallest, uint64_t largest, + int key_size, int value_size, uint64_t interval, + double ratio, int cf = 0) { + for (auto key = smallest; key < largest; key += interval) { + ASSERT_OK(Put(cf, Slice(Key(key, key_size)), + Slice(RandomString(rnd, value_size, ratio)))); + } + ASSERT_OK(Flush(cf)); + } + + // This function behaves with the implicit understanding that two + // rounds of keys are inserted into the database, as per the behavior + // of the DeletionStatsTest. + void SelectivelyDeleteKeys(uint64_t smallest, uint64_t largest, + uint64_t interval, int deletion_interval, + int key_size, uint64_t cutoff_key_num, + CompactionJobStats* stats, int cf = 0) { + // interval needs to be >= 2 so that deletion entries can be inserted + // that are intended to not result in an actual key deletion by using + // an offset of 1 from another existing key + ASSERT_GE(interval, 2); + + uint64_t ctr = 1; + uint32_t deletions_made = 0; + uint32_t num_deleted = 0; + uint32_t num_expired = 0; + for (auto key = smallest; key <= largest; key += interval, ctr++) { + if (ctr % deletion_interval == 0) { + ASSERT_OK(Delete(cf, Key(key, key_size))); + deletions_made++; + num_deleted++; + + if (key > cutoff_key_num) { + num_expired++; + } + } + } + + // Insert some deletions for keys that don't exist that + // are both in and out of the key range + ASSERT_OK(Delete(cf, Key(smallest + 1, key_size))); + deletions_made++; + + ASSERT_OK(Delete(cf, Key(smallest - 1, key_size))); + deletions_made++; + num_expired++; + + ASSERT_OK(Delete(cf, Key(smallest - 9, key_size))); + deletions_made++; + num_expired++; + + ASSERT_OK(Flush(cf)); + SetDeletionCompactionStats(stats, deletions_made, num_expired, num_deleted); + } +}; + +// An EventListener which helps verify the compaction results in +// test CompactionJobStatsTest. +class CompactionJobStatsChecker : public EventListener { + public: + CompactionJobStatsChecker() + : compression_enabled_(false), verify_next_comp_io_stats_(false) {} + + size_t NumberOfUnverifiedStats() { return expected_stats_.size(); } + + void set_verify_next_comp_io_stats(bool v) { verify_next_comp_io_stats_ = v; } + + // Once a compaction completed, this function will verify the returned + // CompactionJobInfo with the oldest CompactionJobInfo added earlier + // in "expected_stats_" which has not yet being used for verification. + void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) override { + if (verify_next_comp_io_stats_) { + ASSERT_GT(ci.stats.file_write_nanos, 0); + ASSERT_GT(ci.stats.file_range_sync_nanos, 0); + ASSERT_GT(ci.stats.file_fsync_nanos, 0); + ASSERT_GT(ci.stats.file_prepare_write_nanos, 0); + verify_next_comp_io_stats_ = false; + } + + std::lock_guard<std::mutex> lock(mutex_); + if (expected_stats_.size()) { + Verify(ci.stats, expected_stats_.front()); + expected_stats_.pop(); + } + } + + // A helper function which verifies whether two CompactionJobStats + // match. 
The verification of all compaction stats are done by + // ASSERT_EQ except for the total input / output bytes, which we + // use ASSERT_GE and ASSERT_LE with a reasonable bias --- + // 10% in uncompressed case and 20% when compression is used. + virtual void Verify(const CompactionJobStats& current_stats, + const CompactionJobStats& stats) { + // time + ASSERT_GT(current_stats.elapsed_micros, 0U); + + ASSERT_EQ(current_stats.num_input_records, stats.num_input_records); + ASSERT_EQ(current_stats.num_input_files, stats.num_input_files); + ASSERT_EQ(current_stats.num_input_files_at_output_level, + stats.num_input_files_at_output_level); + + ASSERT_EQ(current_stats.num_output_records, stats.num_output_records); + ASSERT_EQ(current_stats.num_output_files, stats.num_output_files); + + ASSERT_EQ(current_stats.is_full_compaction, stats.is_full_compaction); + ASSERT_EQ(current_stats.is_manual_compaction, stats.is_manual_compaction); + + // file size + double kFileSizeBias = compression_enabled_ ? 0.20 : 0.10; + ASSERT_GE(current_stats.total_input_bytes * (1.00 + kFileSizeBias), + stats.total_input_bytes); + ASSERT_LE(current_stats.total_input_bytes, + stats.total_input_bytes * (1.00 + kFileSizeBias)); + ASSERT_GE(current_stats.total_output_bytes * (1.00 + kFileSizeBias), + stats.total_output_bytes); + ASSERT_LE(current_stats.total_output_bytes, + stats.total_output_bytes * (1.00 + kFileSizeBias)); + ASSERT_EQ(current_stats.total_input_raw_key_bytes, + stats.total_input_raw_key_bytes); + ASSERT_EQ(current_stats.total_input_raw_value_bytes, + stats.total_input_raw_value_bytes); + + ASSERT_EQ(current_stats.num_records_replaced, stats.num_records_replaced); + + ASSERT_EQ(current_stats.num_corrupt_keys, stats.num_corrupt_keys); + + ASSERT_EQ(std::string(current_stats.smallest_output_key_prefix), + std::string(stats.smallest_output_key_prefix)); + ASSERT_EQ(std::string(current_stats.largest_output_key_prefix), + std::string(stats.largest_output_key_prefix)); + } + + // Add an expected compaction stats, which will be used to + // verify the CompactionJobStats returned by the OnCompactionCompleted() + // callback. + void AddExpectedStats(const CompactionJobStats& stats) { + std::lock_guard<std::mutex> lock(mutex_); + expected_stats_.push(stats); + } + + void EnableCompression(bool flag) { compression_enabled_ = flag; } + + bool verify_next_comp_io_stats() const { return verify_next_comp_io_stats_; } + + private: + std::mutex mutex_; + std::queue<CompactionJobStats> expected_stats_; + bool compression_enabled_; + bool verify_next_comp_io_stats_; +}; + +// An EventListener which helps verify the compaction statistics in +// the test DeletionStatsTest. +class CompactionJobDeletionStatsChecker : public CompactionJobStatsChecker { + public: + // Verifies whether two CompactionJobStats match. 
+ void Verify(const CompactionJobStats& current_stats, + const CompactionJobStats& stats) override { + ASSERT_EQ(current_stats.num_input_deletion_records, + stats.num_input_deletion_records); + ASSERT_EQ(current_stats.num_expired_deletion_records, + stats.num_expired_deletion_records); + ASSERT_EQ(current_stats.num_records_replaced, stats.num_records_replaced); + + ASSERT_EQ(current_stats.num_corrupt_keys, stats.num_corrupt_keys); + } +}; + +namespace { + +uint64_t EstimatedFileSize(uint64_t num_records, size_t key_size, + size_t value_size, double compression_ratio = 1.0, + size_t block_size = 4096, + int bloom_bits_per_key = 10) { + const size_t kPerKeyOverhead = 8; + const size_t kFooterSize = 512; + + uint64_t data_size = static_cast<uint64_t>( + num_records * + (key_size + value_size * compression_ratio + kPerKeyOverhead)); + + return data_size + kFooterSize + + num_records * bloom_bits_per_key / 8 // filter block + + data_size * (key_size + 8) / block_size; // index block +} + +namespace { + +void CopyPrefix(const Slice& src, size_t prefix_length, std::string* dst) { + assert(prefix_length > 0); + size_t length = src.size() > prefix_length ? prefix_length : src.size(); + dst->assign(src.data(), length); +} + +} // namespace + +CompactionJobStats NewManualCompactionJobStats( + const std::string& smallest_key, const std::string& largest_key, + size_t num_input_files, size_t num_input_files_at_output_level, + uint64_t num_input_records, size_t key_size, size_t value_size, + size_t num_output_files, uint64_t num_output_records, + double compression_ratio, uint64_t num_records_replaced, + bool is_full = false, bool is_manual = true) { + CompactionJobStats stats; + stats.Reset(); + + stats.num_input_records = num_input_records; + stats.num_input_files = num_input_files; + stats.num_input_files_at_output_level = num_input_files_at_output_level; + + stats.num_output_records = num_output_records; + stats.num_output_files = num_output_files; + + stats.total_input_bytes = + EstimatedFileSize(num_input_records / num_input_files, key_size, + value_size, compression_ratio) * + num_input_files; + stats.total_output_bytes = + EstimatedFileSize(num_output_records / num_output_files, key_size, + value_size, compression_ratio) * + num_output_files; + stats.total_input_raw_key_bytes = num_input_records * (key_size + 8); + stats.total_input_raw_value_bytes = num_input_records * value_size; + + stats.is_full_compaction = is_full; + stats.is_manual_compaction = is_manual; + + stats.num_records_replaced = num_records_replaced; + + CopyPrefix(smallest_key, CompactionJobStats::kMaxPrefixLength, + &stats.smallest_output_key_prefix); + CopyPrefix(largest_key, CompactionJobStats::kMaxPrefixLength, + &stats.largest_output_key_prefix); + + return stats; +} + +CompressionType GetAnyCompression() { + if (Snappy_Supported()) { + return kSnappyCompression; + } else if (Zlib_Supported()) { + return kZlibCompression; + } else if (BZip2_Supported()) { + return kBZip2Compression; + } else if (LZ4_Supported()) { + return kLZ4Compression; + } else if (XPRESS_Supported()) { + return kXpressCompression; + } + + return kNoCompression; +} + +} // namespace + +TEST_P(CompactionJobStatsTest, CompactionJobStatsTest) { + Random rnd(301); + const int kBufSize = 100; + char buf[kBufSize]; + uint64_t key_base = 100000000l; + // Note: key_base must be multiple of num_keys_per_L0_file + int num_keys_per_L0_file = 100; + const int kTestScale = 8; + const int kKeySize = 10; + const int kValueSize = 1000; + const double kCompressionRatio = 
0.5; + double compression_ratio = 1.0; + uint64_t key_interval = key_base / num_keys_per_L0_file; + + // Whenever a compaction completes, this listener will try to + // verify whether the returned CompactionJobStats matches + // what we expect. The expected CompactionJobStats is added + // via AddExpectedStats(). + auto* stats_checker = new CompactionJobStatsChecker(); + Options options; + options.listeners.emplace_back(stats_checker); + options.create_if_missing = true; + // just enough setting to hold off auto-compaction. + options.level0_file_num_compaction_trigger = kTestScale + 1; + options.num_levels = 3; + options.compression = kNoCompression; + options.max_subcompactions = max_subcompactions_; + options.bytes_per_sync = 512 * 1024; + + options.report_bg_io_stats = true; + for (int test = 0; test < 2; ++test) { + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + // 1st Phase: generate "num_L0_files" L0 files. + int num_L0_files = 0; + for (uint64_t start_key = key_base; start_key <= key_base * kTestScale; + start_key += key_base) { + MakeTableWithKeyValues(&rnd, start_key, start_key + key_base - 1, + kKeySize, kValueSize, key_interval, + compression_ratio, 1); + snprintf(buf, kBufSize, "%d", ++num_L0_files); + ASSERT_EQ(std::string(buf), FilesPerLevel(1)); + } + ASSERT_EQ(std::to_string(num_L0_files), FilesPerLevel(1)); + + // 2nd Phase: perform L0 -> L1 compaction. + int L0_compaction_count = 6; + int count = 1; + std::string smallest_key; + std::string largest_key; + for (uint64_t start_key = key_base; + start_key <= key_base * L0_compaction_count; + start_key += key_base, count++) { + smallest_key = Key(start_key, 10); + largest_key = Key(start_key + key_base - key_interval, 10); + stats_checker->AddExpectedStats(NewManualCompactionJobStats( + smallest_key, largest_key, 1, 0, num_keys_per_L0_file, kKeySize, + kValueSize, 1, num_keys_per_L0_file, compression_ratio, 0)); + ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 1U); + TEST_Compact(0, 1, smallest_key, largest_key); + snprintf(buf, kBufSize, "%d,%d", num_L0_files - count, count); + ASSERT_EQ(std::string(buf), FilesPerLevel(1)); + } + + // compact two files into one in the last L0 -> L1 compaction + int num_remaining_L0 = num_L0_files - L0_compaction_count; + smallest_key = Key(key_base * (L0_compaction_count + 1), 10); + largest_key = Key(key_base * (kTestScale + 1) - key_interval, 10); + stats_checker->AddExpectedStats(NewManualCompactionJobStats( + smallest_key, largest_key, num_remaining_L0, 0, + num_keys_per_L0_file * num_remaining_L0, kKeySize, kValueSize, 1, + num_keys_per_L0_file * num_remaining_L0, compression_ratio, 0)); + ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 1U); + TEST_Compact(0, 1, smallest_key, largest_key); + + int num_L1_files = num_L0_files - num_remaining_L0 + 1; + num_L0_files = 0; + snprintf(buf, kBufSize, "%d,%d", num_L0_files, num_L1_files); + ASSERT_EQ(std::string(buf), FilesPerLevel(1)); + + // 3rd Phase: generate sparse L0 files (wider key-range, same num of keys) + int sparseness = 2; + for (uint64_t start_key = key_base; start_key <= key_base * kTestScale; + start_key += key_base * sparseness) { + MakeTableWithKeyValues( + &rnd, start_key, start_key + key_base * sparseness - 1, kKeySize, + kValueSize, key_base * sparseness / num_keys_per_L0_file, + compression_ratio, 1); + snprintf(buf, kBufSize, "%d,%d", ++num_L0_files, num_L1_files); + ASSERT_EQ(std::string(buf), FilesPerLevel(1)); + } + + // 4th Phase: perform L0 -> L1 compaction again, expect higher 
write amp + // When subcompactions are enabled, the number of output files increases + // by 1 because multiple threads are consuming the input and generating + // output files without coordinating to see if the output could fit into + // a smaller number of files like it does when it runs sequentially + int num_output_files = options.max_subcompactions > 1 ? 2 : 1; + for (uint64_t start_key = key_base; num_L0_files > 1; + start_key += key_base * sparseness) { + smallest_key = Key(start_key, 10); + largest_key = Key(start_key + key_base * sparseness - key_interval, 10); + stats_checker->AddExpectedStats(NewManualCompactionJobStats( + smallest_key, largest_key, 3, 2, num_keys_per_L0_file * 3, kKeySize, + kValueSize, num_output_files, + num_keys_per_L0_file * 2, // 1/3 of the data will be updated. + compression_ratio, num_keys_per_L0_file)); + ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 1U); + Compact(1, smallest_key, largest_key); + if (options.max_subcompactions == 1) { + --num_L1_files; + } + snprintf(buf, kBufSize, "%d,%d", --num_L0_files, num_L1_files); + ASSERT_EQ(std::string(buf), FilesPerLevel(1)); + } + + // 5th Phase: Do a full compaction, which involves in two sub-compactions. + // Here we expect to have 1 L0 files and 4 L1 files + // In the first sub-compaction, we expect L0 compaction. + smallest_key = Key(key_base, 10); + largest_key = Key(key_base * (kTestScale + 1) - key_interval, 10); + stats_checker->AddExpectedStats(NewManualCompactionJobStats( + Key(key_base * (kTestScale + 1 - sparseness), 10), largest_key, 2, 1, + num_keys_per_L0_file * 3, kKeySize, kValueSize, 1, + num_keys_per_L0_file * 2, compression_ratio, num_keys_per_L0_file)); + ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 1U); + Compact(1, smallest_key, largest_key); + + num_L1_files = options.max_subcompactions > 1 ? 
7 : 4; + char L1_buf[4]; + snprintf(L1_buf, sizeof(L1_buf), "0,%d", num_L1_files); + std::string L1_files(L1_buf); + ASSERT_EQ(L1_files, FilesPerLevel(1)); + options.compression = GetAnyCompression(); + if (options.compression == kNoCompression) { + break; + } + stats_checker->EnableCompression(true); + compression_ratio = kCompressionRatio; + + for (int i = 0; i < 5; i++) { + ASSERT_OK(Put(1, Slice(Key(key_base + i, 10)), + Slice(RandomString(&rnd, 512 * 1024, 1)))); + } + + ASSERT_OK(Flush(1)); + ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_WaitForCompact()); + + stats_checker->set_verify_next_comp_io_stats(true); + std::atomic<bool> first_prepare_write(true); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WritableFileWriter::Append:BeforePrepareWrite", [&](void* /*arg*/) { + if (first_prepare_write.load()) { + options.env->SleepForMicroseconds(3); + first_prepare_write.store(false); + } + }); + + std::atomic<bool> first_flush(true); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WritableFileWriter::Flush:BeforeAppend", [&](void* /*arg*/) { + if (first_flush.load()) { + options.env->SleepForMicroseconds(3); + first_flush.store(false); + } + }); + + std::atomic<bool> first_sync(true); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WritableFileWriter::SyncInternal:0", [&](void* /*arg*/) { + if (first_sync.load()) { + options.env->SleepForMicroseconds(3); + first_sync.store(false); + } + }); + + std::atomic<bool> first_range_sync(true); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WritableFileWriter::RangeSync:0", [&](void* /*arg*/) { + if (first_range_sync.load()) { + options.env->SleepForMicroseconds(3); + first_range_sync.store(false); + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Compact(1, smallest_key, largest_key); + + ASSERT_TRUE(!stats_checker->verify_next_comp_io_stats()); + ASSERT_TRUE(!first_prepare_write.load()); + ASSERT_TRUE(!first_flush.load()); + ASSERT_TRUE(!first_sync.load()); + ASSERT_TRUE(!first_range_sync.load()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + } + ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 0U); +} + +TEST_P(CompactionJobStatsTest, DeletionStatsTest) { + Random rnd(301); + uint64_t key_base = 100000l; + // Note: key_base must be multiple of num_keys_per_L0_file + int num_keys_per_L0_file = 20; + const int kTestScale = 8; // make sure this is even + const int kKeySize = 10; + const int kValueSize = 100; + double compression_ratio = 1.0; + uint64_t key_interval = key_base / num_keys_per_L0_file; + uint64_t largest_key_num = key_base * (kTestScale + 1) - key_interval; + uint64_t cutoff_key_num = key_base * (kTestScale / 2 + 1) - key_interval; + const std::string smallest_key = Key(key_base - 10, kKeySize); + const std::string largest_key = Key(largest_key_num + 10, kKeySize); + + // Whenever a compaction completes, this listener will try to + // verify whether the returned CompactionJobStats matches + // what we expect. 
+ auto* stats_checker = new CompactionJobDeletionStatsChecker(); + Options options; + options.listeners.emplace_back(stats_checker); + options.create_if_missing = true; + options.level0_file_num_compaction_trigger = kTestScale + 1; + options.num_levels = 3; + options.compression = kNoCompression; + options.max_bytes_for_level_multiplier = 2; + options.max_subcompactions = max_subcompactions_; + + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + // Stage 1: Generate several L0 files and then send them to L2 by + // using CompactRangeOptions and CompactRange(). These files will + // have a strict subset of the keys from the full key-range + for (uint64_t start_key = key_base; start_key <= key_base * kTestScale / 2; + start_key += key_base) { + MakeTableWithKeyValues(&rnd, start_key, start_key + key_base - 1, kKeySize, + kValueSize, key_interval, compression_ratio, 1); + } + + CompactRangeOptions cr_options; + cr_options.change_level = true; + cr_options.target_level = 2; + ASSERT_OK(db_->CompactRange(cr_options, handles_[1], nullptr, nullptr)); + ASSERT_GT(NumTableFilesAtLevel(2, 1), 0); + + // Stage 2: Generate files including keys from the entire key range + for (uint64_t start_key = key_base; start_key <= key_base * kTestScale; + start_key += key_base) { + MakeTableWithKeyValues(&rnd, start_key, start_key + key_base - 1, kKeySize, + kValueSize, key_interval, compression_ratio, 1); + } + + // Send these L0 files to L1 + TEST_Compact(0, 1, smallest_key, largest_key); + ASSERT_GT(NumTableFilesAtLevel(1, 1), 0); + + // Add a new record and flush so now there is a L0 file + // with a value too (not just deletions from the next step) + ASSERT_OK(Put(1, Key(key_base - 6, kKeySize), "test")); + ASSERT_OK(Flush(1)); + + // Stage 3: Generate L0 files with some deletions so now + // there are files with the same key range in L0, L1, and L2 + int deletion_interval = 3; + CompactionJobStats first_compaction_stats; + SelectivelyDeleteKeys(key_base, largest_key_num, key_interval, + deletion_interval, kKeySize, cutoff_key_num, + &first_compaction_stats, 1); + + stats_checker->AddExpectedStats(first_compaction_stats); + + // Stage 4: Trigger compaction and verify the stats + TEST_Compact(0, 1, smallest_key, largest_key); +} + +namespace { +int GetUniversalCompactionInputUnits(uint32_t num_flushes) { + uint32_t compaction_input_units; + for (compaction_input_units = 1; num_flushes >= compaction_input_units; + compaction_input_units *= 2) { + if ((num_flushes & compaction_input_units) != 0) { + return compaction_input_units > 1 ? 
compaction_input_units : 0; + } + } + return 0; +} +} // namespace + +TEST_P(CompactionJobStatsTest, UniversalCompactionTest) { + Random rnd(301); + uint64_t key_base = 100000000l; + // Note: key_base must be multiple of num_keys_per_L0_file + int num_keys_per_table = 100; + const uint32_t kTestScale = 6; + const int kKeySize = 10; + const int kValueSize = 900; + double compression_ratio = 1.0; + uint64_t key_interval = key_base / num_keys_per_table; + + auto* stats_checker = new CompactionJobStatsChecker(); + Options options; + options.listeners.emplace_back(stats_checker); + options.create_if_missing = true; + options.num_levels = 3; + options.compression = kNoCompression; + options.level0_file_num_compaction_trigger = 2; + options.target_file_size_base = num_keys_per_table * 1000; + options.compaction_style = kCompactionStyleUniversal; + options.compaction_options_universal.size_ratio = 1; + options.compaction_options_universal.max_size_amplification_percent = 1000; + options.max_subcompactions = max_subcompactions_; + + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + // Generates the expected CompactionJobStats for each compaction + for (uint32_t num_flushes = 2; num_flushes <= kTestScale; num_flushes++) { + // Here we treat one newly flushed file as an unit. + // + // For example, if a newly flushed file is 100k, and a compaction has + // 4 input units, then this compaction inputs 400k. + uint32_t num_input_units = GetUniversalCompactionInputUnits(num_flushes); + if (num_input_units == 0) { + continue; + } + // A full compaction only happens when the number of flushes equals to + // the number of compaction input runs. + bool is_full = num_flushes == num_input_units; + // The following statement determines the expected smallest key + // based on whether it is a full compaction. + uint64_t smallest_key = is_full ? key_base : key_base * (num_flushes - 1); + + stats_checker->AddExpectedStats(NewManualCompactionJobStats( + Key(smallest_key, 10), + Key(smallest_key + key_base * num_input_units - key_interval, 10), + num_input_units, num_input_units > 2 ? 
num_input_units / 2 : 0, + num_keys_per_table * num_input_units, kKeySize, kValueSize, + num_input_units, num_keys_per_table * num_input_units, 1.0, 0, is_full, + false)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + } + ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 3U); + + for (uint64_t start_key = key_base; start_key <= key_base * kTestScale; + start_key += key_base) { + MakeTableWithKeyValues(&rnd, start_key, start_key + key_base - 1, kKeySize, + kValueSize, key_interval, compression_ratio, 1); + ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_WaitForCompact()); + } + ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 0U); +} + +INSTANTIATE_TEST_CASE_P(CompactionJobStatsTest, CompactionJobStatsTest, + ::testing::Values(1, 4)); +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + +#else +#include <stdio.h> + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, "SKIPPED, not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // !ROCKSDB_LITE + +#else + +int main(int /*argc*/, char** /*argv*/) { return 0; } +#endif // !defined(IOS_CROSS_COMPILE) diff --git a/src/rocksdb/db/compaction/compaction_job_test.cc b/src/rocksdb/db/compaction/compaction_job_test.cc new file mode 100644 index 000000000..c87871100 --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_job_test.cc @@ -0,0 +1,2451 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE + +#include "db/compaction/compaction_job.h" + +#include <algorithm> +#include <array> +#include <cinttypes> +#include <map> +#include <string> +#include <tuple> + +#include "db/blob/blob_index.h" +#include "db/column_family.h" +#include "db/db_impl/db_impl.h" +#include "db/error_handler.h" +#include "db/version_set.h" +#include "file/random_access_file_reader.h" +#include "file/writable_file_writer.h" +#include "options/options_helper.h" +#include "rocksdb/cache.h" +#include "rocksdb/convenience.h" +#include "rocksdb/db.h" +#include "rocksdb/file_system.h" +#include "rocksdb/options.h" +#include "rocksdb/write_buffer_manager.h" +#include "table/mock_table.h" +#include "table/unique_id_impl.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/string_util.h" +#include "utilities/merge_operators.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { + +void VerifyInitializationOfCompactionJobStats( + const CompactionJobStats& compaction_job_stats) { +#if !defined(IOS_CROSS_COMPILE) + ASSERT_EQ(compaction_job_stats.elapsed_micros, 0U); + + ASSERT_EQ(compaction_job_stats.num_input_records, 0U); + ASSERT_EQ(compaction_job_stats.num_input_files, 0U); + ASSERT_EQ(compaction_job_stats.num_input_files_at_output_level, 0U); + + ASSERT_EQ(compaction_job_stats.num_output_records, 0U); + ASSERT_EQ(compaction_job_stats.num_output_files, 0U); + + ASSERT_EQ(compaction_job_stats.is_manual_compaction, true); + + ASSERT_EQ(compaction_job_stats.total_input_bytes, 0U); + ASSERT_EQ(compaction_job_stats.total_output_bytes, 0U); + + ASSERT_EQ(compaction_job_stats.total_input_raw_key_bytes, 0U); + ASSERT_EQ(compaction_job_stats.total_input_raw_value_bytes, 0U); + + ASSERT_EQ(compaction_job_stats.smallest_output_key_prefix[0], 0); + 
ASSERT_EQ(compaction_job_stats.largest_output_key_prefix[0], 0); + + ASSERT_EQ(compaction_job_stats.num_records_replaced, 0U); + + ASSERT_EQ(compaction_job_stats.num_input_deletion_records, 0U); + ASSERT_EQ(compaction_job_stats.num_expired_deletion_records, 0U); + + ASSERT_EQ(compaction_job_stats.num_corrupt_keys, 0U); +#endif // !defined(IOS_CROSS_COMPILE) +} + +// Mock FSWritableFile for testing io priority. +// Only override the essential functions for testing compaction io priority. +class MockTestWritableFile : public FSWritableFileOwnerWrapper { + public: + MockTestWritableFile(std::unique_ptr<FSWritableFile>&& file, + Env::IOPriority io_priority) + : FSWritableFileOwnerWrapper(std::move(file)), + write_io_priority_(io_priority) {} + IOStatus Append(const Slice& data, const IOOptions& options, + IODebugContext* dbg) override { + EXPECT_EQ(options.rate_limiter_priority, write_io_priority_); + return target()->Append(data, options, dbg); + } + IOStatus Append(const Slice& data, const IOOptions& options, + const DataVerificationInfo& verification_info, + IODebugContext* dbg) override { + EXPECT_EQ(options.rate_limiter_priority, write_io_priority_); + return target()->Append(data, options, verification_info, dbg); + } + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override { + EXPECT_EQ(options.rate_limiter_priority, write_io_priority_); + return target()->Close(options, dbg); + } + IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override { + EXPECT_EQ(options.rate_limiter_priority, write_io_priority_); + return target()->Flush(options, dbg); + } + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override { + EXPECT_EQ(options.rate_limiter_priority, write_io_priority_); + return target()->Sync(options, dbg); + } + IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override { + EXPECT_EQ(options.rate_limiter_priority, write_io_priority_); + return target()->Fsync(options, dbg); + } + uint64_t GetFileSize(const IOOptions& options, IODebugContext* dbg) override { + EXPECT_EQ(options.rate_limiter_priority, write_io_priority_); + return target()->GetFileSize(options, dbg); + } + IOStatus RangeSync(uint64_t offset, uint64_t nbytes, const IOOptions& options, + IODebugContext* dbg) override { + EXPECT_EQ(options.rate_limiter_priority, write_io_priority_); + return target()->RangeSync(offset, nbytes, options, dbg); + } + + void PrepareWrite(size_t offset, size_t len, const IOOptions& options, + IODebugContext* dbg) override { + EXPECT_EQ(options.rate_limiter_priority, write_io_priority_); + target()->PrepareWrite(offset, len, options, dbg); + } + + IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& options, + IODebugContext* dbg) override { + EXPECT_EQ(options.rate_limiter_priority, write_io_priority_); + return target()->Allocate(offset, len, options, dbg); + } + + private: + Env::IOPriority write_io_priority_; +}; + +// Mock FSRandomAccessFile for testing io priority. +// Only override the essential functions for testing compaction io priority. 
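As an aside, the mock writable and random-access file wrappers above and below only assert that each compaction read and write carries the expected rate_limiter_priority; they do not throttle anything. Below is a minimal sketch (with an assumed, purely illustrative budget) of the user-facing side of that mechanism: installing a shared rate limiter so that compaction I/O charged at these priorities (Env::IO_LOW normally, Env::IO_USER when writes are delayed or stopped, as the CheckGetRateLimiterPriority() helper later in this file verifies) is actually limited.

#include <memory>
#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/rate_limiter.h"

int main() {
  ROCKSDB_NAMESPACE::Options options;
  options.create_if_missing = true;
  // Illustrative budget: 32 MB/s shared by flush and compaction writes.
  options.rate_limiter.reset(
      ROCKSDB_NAMESPACE::NewGenericRateLimiter(32 * 1024 * 1024));
  ROCKSDB_NAMESPACE::DB* db = nullptr;
  ROCKSDB_NAMESPACE::Status s =
      ROCKSDB_NAMESPACE::DB::Open(options, "/tmp/rate_limited_db", &db);
  delete db;  // no-op if Open failed and db stayed null
  return s.ok() ? 0 : 1;
}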
+class MockTestRandomAccessFile : public FSRandomAccessFileOwnerWrapper { + public: + MockTestRandomAccessFile(std::unique_ptr<FSRandomAccessFile>&& file, + Env::IOPriority io_priority) + : FSRandomAccessFileOwnerWrapper(std::move(file)), + read_io_priority_(io_priority) {} + + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override { + EXPECT_EQ(options.rate_limiter_priority, read_io_priority_); + return target()->Read(offset, n, options, result, scratch, dbg); + } + IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& options, + IODebugContext* dbg) override { + EXPECT_EQ(options.rate_limiter_priority, read_io_priority_); + return target()->Prefetch(offset, n, options, dbg); + } + + private: + Env::IOPriority read_io_priority_; +}; + +// Mock FileSystem for testing io priority. +class MockTestFileSystem : public FileSystemWrapper { + public: + explicit MockTestFileSystem(const std::shared_ptr<FileSystem>& base, + Env::IOPriority read_io_priority, + Env::IOPriority write_io_priority) + : FileSystemWrapper(base), + read_io_priority_(read_io_priority), + write_io_priority_(write_io_priority) {} + + static const char* kClassName() { return "MockTestFileSystem"; } + const char* Name() const override { return kClassName(); } + + IOStatus NewRandomAccessFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr<FSRandomAccessFile>* result, + IODebugContext* dbg) override { + IOStatus s = target()->NewRandomAccessFile(fname, file_opts, result, dbg); + EXPECT_OK(s); + result->reset( + new MockTestRandomAccessFile(std::move(*result), read_io_priority_)); + return s; + } + IOStatus NewWritableFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr<FSWritableFile>* result, + IODebugContext* dbg) override { + IOStatus s = target()->NewWritableFile(fname, file_opts, result, dbg); + EXPECT_OK(s); + result->reset( + new MockTestWritableFile(std::move(*result), write_io_priority_)); + return s; + } + + private: + Env::IOPriority read_io_priority_; + Env::IOPriority write_io_priority_; +}; + +enum TableTypeForTest : uint8_t { kMockTable = 0, kBlockBasedTable = 1 }; + +} // namespace + +class CompactionJobTestBase : public testing::Test { + protected: + CompactionJobTestBase(std::string dbname, const Comparator* ucmp, + std::function<std::string(uint64_t)> encode_u64_ts, + bool test_io_priority, TableTypeForTest table_type) + : dbname_(std::move(dbname)), + ucmp_(ucmp), + db_options_(), + mutable_cf_options_(cf_options_), + mutable_db_options_(), + table_cache_(NewLRUCache(50000, 16)), + write_buffer_manager_(db_options_.db_write_buffer_size), + versions_(new VersionSet( + dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, + /*io_tracer=*/nullptr, /*db_id*/ "", /*db_session_id*/ "")), + shutting_down_(false), + mock_table_factory_(new mock::MockTableFactory()), + error_handler_(nullptr, db_options_, &mutex_), + encode_u64_ts_(std::move(encode_u64_ts)), + test_io_priority_(test_io_priority), + table_type_(table_type) { + Env* base_env = Env::Default(); + EXPECT_OK( + test::CreateEnvFromSystem(ConfigOptions(), &base_env, &env_guard_)); + env_ = base_env; + fs_ = env_->GetFileSystem(); + // set default for the tests + mutable_cf_options_.target_file_size_base = 1024 * 1024; + mutable_cf_options_.max_compaction_bytes = 10 * 1024 * 1024; + } + + void SetUp() override { + 
EXPECT_OK(env_->CreateDirIfMissing(dbname_)); + db_options_.env = env_; + db_options_.fs = fs_; + db_options_.db_paths.emplace_back(dbname_, + std::numeric_limits<uint64_t>::max()); + cf_options_.comparator = ucmp_; + if (table_type_ == TableTypeForTest::kBlockBasedTable) { + BlockBasedTableOptions table_options; + cf_options_.table_factory.reset(NewBlockBasedTableFactory(table_options)); + } else if (table_type_ == TableTypeForTest::kMockTable) { + cf_options_.table_factory = mock_table_factory_; + } else { + assert(false); + } + } + + std::string GenerateFileName(uint64_t file_number) { + FileMetaData meta; + std::vector<DbPath> db_paths; + db_paths.emplace_back(dbname_, std::numeric_limits<uint64_t>::max()); + meta.fd = FileDescriptor(file_number, 0, 0); + return TableFileName(db_paths, meta.fd.GetNumber(), meta.fd.GetPathId()); + } + + std::string KeyStr(const std::string& user_key, const SequenceNumber seq_num, + const ValueType t, uint64_t ts = 0) { + std::string user_key_with_ts = user_key + encode_u64_ts_(ts); + return InternalKey(user_key_with_ts, seq_num, t).Encode().ToString(); + } + + static std::string BlobStr(uint64_t blob_file_number, uint64_t offset, + uint64_t size) { + std::string blob_index; + BlobIndex::EncodeBlob(&blob_index, blob_file_number, offset, size, + kNoCompression); + return blob_index; + } + + static std::string BlobStrTTL(uint64_t blob_file_number, uint64_t offset, + uint64_t size, uint64_t expiration) { + std::string blob_index; + BlobIndex::EncodeBlobTTL(&blob_index, expiration, blob_file_number, offset, + size, kNoCompression); + return blob_index; + } + + static std::string BlobStrInlinedTTL(const Slice& value, + uint64_t expiration) { + std::string blob_index; + BlobIndex::EncodeInlinedTTL(&blob_index, expiration, value); + return blob_index; + } + + // Creates a table with the specificied key value pairs. + void CreateTable(const std::string& table_name, + const mock::KVVector& contents, uint64_t& file_size) { + std::unique_ptr<WritableFileWriter> file_writer; + Status s = WritableFileWriter::Create(fs_, table_name, FileOptions(), + &file_writer, nullptr); + ASSERT_OK(s); + std::unique_ptr<TableBuilder> table_builder( + cf_options_.table_factory->NewTableBuilder( + TableBuilderOptions(*cfd_->ioptions(), mutable_cf_options_, + cfd_->internal_comparator(), + cfd_->int_tbl_prop_collector_factories(), + CompressionType::kNoCompression, + CompressionOptions(), 0 /* column_family_id */, + kDefaultColumnFamilyName, -1 /* level */), + file_writer.get())); + // Build table. 
+ for (auto kv : contents) { + std::string key; + std::string value; + std::tie(key, value) = kv; + table_builder->Add(key, value); + } + ASSERT_OK(table_builder->Finish()); + file_size = table_builder->FileSize(); + } + + void AddMockFile(const mock::KVVector& contents, int level = 0) { + assert(contents.size() > 0); + + bool first_key = true; + std::string smallest, largest; + InternalKey smallest_key, largest_key; + SequenceNumber smallest_seqno = kMaxSequenceNumber; + SequenceNumber largest_seqno = 0; + uint64_t oldest_blob_file_number = kInvalidBlobFileNumber; + for (auto kv : contents) { + ParsedInternalKey key; + std::string skey; + std::string value; + std::tie(skey, value) = kv; + const Status pik_status = + ParseInternalKey(skey, &key, true /* log_err_key */); + + smallest_seqno = std::min(smallest_seqno, key.sequence); + largest_seqno = std::max(largest_seqno, key.sequence); + + if (first_key || + cfd_->user_comparator()->Compare(key.user_key, smallest) < 0) { + smallest.assign(key.user_key.data(), key.user_key.size()); + smallest_key.DecodeFrom(skey); + } + if (first_key || + cfd_->user_comparator()->Compare(key.user_key, largest) > 0) { + largest.assign(key.user_key.data(), key.user_key.size()); + largest_key.DecodeFrom(skey); + } + + first_key = false; + + if (pik_status.ok() && key.type == kTypeBlobIndex) { + BlobIndex blob_index; + const Status s = blob_index.DecodeFrom(value); + if (!s.ok()) { + continue; + } + + if (blob_index.IsInlined() || blob_index.HasTTL() || + blob_index.file_number() == kInvalidBlobFileNumber) { + continue; + } + + if (oldest_blob_file_number == kInvalidBlobFileNumber || + oldest_blob_file_number > blob_index.file_number()) { + oldest_blob_file_number = blob_index.file_number(); + } + } + } + + uint64_t file_number = versions_->NewFileNumber(); + + uint64_t file_size = 0; + if (table_type_ == TableTypeForTest::kBlockBasedTable) { + CreateTable(GenerateFileName(file_number), contents, file_size); + } else if (table_type_ == TableTypeForTest::kMockTable) { + file_size = 10; + EXPECT_OK(mock_table_factory_->CreateMockTable( + env_, GenerateFileName(file_number), std::move(contents))); + } else { + assert(false); + } + + VersionEdit edit; + edit.AddFile(level, file_number, 0, file_size, smallest_key, largest_key, + smallest_seqno, largest_seqno, false, Temperature::kUnknown, + oldest_blob_file_number, kUnknownOldestAncesterTime, + kUnknownFileCreationTime, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2); + + mutex_.Lock(); + EXPECT_OK( + versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(), + mutable_cf_options_, &edit, &mutex_, nullptr)); + mutex_.Unlock(); + } + + void VerifyTables(int output_level, + const std::vector<mock::KVVector>& expected_results, + std::vector<uint64_t> expected_oldest_blob_file_numbers) { + if (expected_results.empty()) { + ASSERT_EQ(compaction_job_stats_.num_output_files, 0U); + return; + } + int expected_output_file_num = 0; + for (const auto& e : expected_results) { + if (!e.empty()) { + ++expected_output_file_num; + } + } + ASSERT_EQ(expected_output_file_num, compaction_job_stats_.num_output_files); + if (expected_output_file_num == 0) { + return; + } + + if (expected_oldest_blob_file_numbers.empty()) { + expected_oldest_blob_file_numbers.resize(expected_output_file_num, + kInvalidBlobFileNumber); + } + + auto cfd = versions_->GetColumnFamilySet()->GetDefault(); + if (table_type_ == TableTypeForTest::kMockTable) { + ASSERT_EQ(compaction_job_stats_.num_output_files, + 
expected_results.size()); + mock_table_factory_->AssertLatestFiles(expected_results); + } else { + assert(table_type_ == TableTypeForTest::kBlockBasedTable); + } + + auto output_files = + cfd->current()->storage_info()->LevelFiles(output_level); + ASSERT_EQ(expected_output_file_num, output_files.size()); + + if (table_type_ == TableTypeForTest::kMockTable) { + assert(output_files.size() == + static_cast<size_t>(expected_output_file_num)); + const FileMetaData* const output_file = output_files[0]; + ASSERT_EQ(output_file->oldest_blob_file_number, + expected_oldest_blob_file_numbers[0]); + return; + } + + for (size_t i = 0; i < expected_results.size(); ++i) { + const FileMetaData* const output_file = output_files[i]; + std::string file_name = GenerateFileName(output_file->fd.GetNumber()); + const auto& fs = env_->GetFileSystem(); + std::unique_ptr<RandomAccessFileReader> freader; + IOStatus ios = RandomAccessFileReader::Create( + fs, file_name, FileOptions(), &freader, nullptr); + ASSERT_OK(ios); + std::unique_ptr<TableReader> table_reader; + uint64_t file_size = output_file->fd.GetFileSize(); + ReadOptions read_opts; + Status s = cf_options_.table_factory->NewTableReader( + read_opts, + TableReaderOptions(*cfd->ioptions(), nullptr, FileOptions(), + cfd_->internal_comparator()), + std::move(freader), file_size, &table_reader, false); + ASSERT_OK(s); + assert(table_reader); + std::unique_ptr<InternalIterator> iiter( + table_reader->NewIterator(read_opts, nullptr, nullptr, true, + TableReaderCaller::kUncategorized)); + assert(iiter); + + mock::KVVector from_db; + for (iiter->SeekToFirst(); iiter->Valid(); iiter->Next()) { + const Slice key = iiter->key(); + const Slice value = iiter->value(); + from_db.emplace_back( + make_pair(key.ToString(false), value.ToString(false))); + } + ASSERT_EQ(expected_results[i], from_db); + } + } + + void SetLastSequence(const SequenceNumber sequence_number) { + versions_->SetLastAllocatedSequence(sequence_number + 1); + versions_->SetLastPublishedSequence(sequence_number + 1); + versions_->SetLastSequence(sequence_number + 1); + } + + // returns expected result after compaction + mock::KVVector CreateTwoFiles(bool gen_corrupted_keys) { + stl_wrappers::KVMap expected_results; + constexpr int kKeysPerFile = 10000; + constexpr int kCorruptKeysPerFile = 200; + constexpr int kMatchingKeys = kKeysPerFile / 2; + SequenceNumber sequence_number = 0; + + auto corrupt_id = [&](int id) { + return gen_corrupted_keys && id > 0 && id <= kCorruptKeysPerFile; + }; + + for (int i = 0; i < 2; ++i) { + auto contents = mock::MakeMockFile(); + for (int k = 0; k < kKeysPerFile; ++k) { + auto key = std::to_string(i * kMatchingKeys + k); + auto value = std::to_string(i * kKeysPerFile + k); + InternalKey internal_key(key, ++sequence_number, kTypeValue); + + // This is how the key will look like once it's written in bottommost + // file + InternalKey bottommost_internal_key(key, 0, kTypeValue); + + if (corrupt_id(k)) { + test::CorruptKeyType(&internal_key); + test::CorruptKeyType(&bottommost_internal_key); + } + contents.push_back({internal_key.Encode().ToString(), value}); + if (i == 1 || k < kMatchingKeys || corrupt_id(k - kMatchingKeys)) { + expected_results.insert( + {bottommost_internal_key.Encode().ToString(), value}); + } + } + mock::SortKVVector(&contents, ucmp_); + + AddMockFile(contents); + } + + SetLastSequence(sequence_number); + + mock::KVVector expected_results_kvvector; + for (auto& kv : expected_results) { + expected_results_kvvector.push_back({kv.first, kv.second}); + } + 
+ return expected_results_kvvector; + } + + void NewDB() { + EXPECT_OK(DestroyDB(dbname_, Options())); + EXPECT_OK(env_->CreateDirIfMissing(dbname_)); + + std::shared_ptr<Logger> info_log; + DBOptions db_opts = BuildDBOptions(db_options_, mutable_db_options_); + Status s = CreateLoggerFromOptions(dbname_, db_opts, &info_log); + ASSERT_OK(s); + db_options_.info_log = info_log; + + versions_.reset( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_id*/ "", /*db_session_id*/ "")); + compaction_job_stats_.Reset(); + ASSERT_OK(SetIdentityFile(env_, dbname_)); + + VersionEdit new_db; + new_db.SetLogNumber(0); + new_db.SetNextFile(2); + new_db.SetLastSequence(0); + + const std::string manifest = DescriptorFileName(dbname_, 1); + std::unique_ptr<WritableFileWriter> file_writer; + const auto& fs = env_->GetFileSystem(); + s = WritableFileWriter::Create(fs, manifest, + fs->OptimizeForManifestWrite(env_options_), + &file_writer, nullptr); + + ASSERT_OK(s); + { + log::Writer log(std::move(file_writer), 0, false); + std::string record; + new_db.EncodeTo(&record); + s = log.AddRecord(record); + } + ASSERT_OK(s); + // Make "CURRENT" file that points to the new manifest file. + s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr); + + ASSERT_OK(s); + + cf_options_.merge_operator = merge_op_; + cf_options_.compaction_filter = compaction_filter_.get(); + std::vector<ColumnFamilyDescriptor> column_families; + column_families.emplace_back(kDefaultColumnFamilyName, cf_options_); + + ASSERT_OK(versions_->Recover(column_families, false)); + cfd_ = versions_->GetColumnFamilySet()->GetDefault(); + } + + // input_files[i] on input_levels[i] + void RunLastLevelCompaction( + const std::vector<std::vector<FileMetaData*>>& input_files, + const std::vector<int> input_levels, + std::function<void(Compaction& comp)>&& verify_func, + const std::vector<SequenceNumber>& snapshots = {}) { + const int kLastLevel = cf_options_.num_levels - 1; + verify_per_key_placement_ = std::move(verify_func); + mock::KVVector empty_map; + RunCompaction(input_files, input_levels, {empty_map}, snapshots, + kMaxSequenceNumber, kLastLevel, false); + } + + // input_files[i] on input_levels[i] + void RunCompaction( + const std::vector<std::vector<FileMetaData*>>& input_files, + const std::vector<int>& input_levels, + const std::vector<mock::KVVector>& expected_results, + const std::vector<SequenceNumber>& snapshots = {}, + SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber, + int output_level = 1, bool verify = true, + std::vector<uint64_t> expected_oldest_blob_file_numbers = {}, + bool check_get_priority = false, + Env::IOPriority read_io_priority = Env::IO_TOTAL, + Env::IOPriority write_io_priority = Env::IO_TOTAL, + int max_subcompactions = 0) { + // For compaction, set fs as MockTestFileSystem to check the io_priority. 
+ if (test_io_priority_) { + db_options_.fs.reset( + new MockTestFileSystem(fs_, read_io_priority, write_io_priority)); + } + + auto cfd = versions_->GetColumnFamilySet()->GetDefault(); + + size_t num_input_files = 0; + std::vector<CompactionInputFiles> compaction_input_files; + for (size_t i = 0; i < input_files.size(); ++i) { + auto level_files = input_files[i]; + CompactionInputFiles compaction_level; + compaction_level.level = input_levels[i]; + compaction_level.files.insert(compaction_level.files.end(), + level_files.begin(), level_files.end()); + compaction_input_files.push_back(compaction_level); + num_input_files += level_files.size(); + } + + std::vector<FileMetaData*> grandparents; + // it should actually be the next non-empty level + const int kGrandparentsLevel = output_level + 1; + if (kGrandparentsLevel < cf_options_.num_levels) { + grandparents = + cfd_->current()->storage_info()->LevelFiles(kGrandparentsLevel); + } + + Compaction compaction( + cfd->current()->storage_info(), *cfd->ioptions(), + *cfd->GetLatestMutableCFOptions(), mutable_db_options_, + compaction_input_files, output_level, + mutable_cf_options_.target_file_size_base, + mutable_cf_options_.max_compaction_bytes, 0, kNoCompression, + cfd->GetLatestMutableCFOptions()->compression_opts, + Temperature::kUnknown, max_subcompactions, grandparents, true); + compaction.SetInputVersion(cfd->current()); + + assert(db_options_.info_log); + LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get()); + mutex_.Lock(); + EventLogger event_logger(db_options_.info_log.get()); + // TODO(yiwu) add a mock snapshot checker and add test for it. + SnapshotChecker* snapshot_checker = nullptr; + ASSERT_TRUE(full_history_ts_low_.empty() || + ucmp_->timestamp_size() == full_history_ts_low_.size()); + const std::atomic<bool> kManualCompactionCanceledFalse{false}; + CompactionJob compaction_job( + 0, &compaction, db_options_, mutable_db_options_, env_options_, + versions_.get(), &shutting_down_, &log_buffer, nullptr, nullptr, + nullptr, nullptr, &mutex_, &error_handler_, snapshots, + earliest_write_conflict_snapshot, snapshot_checker, nullptr, + table_cache_, &event_logger, false, false, dbname_, + &compaction_job_stats_, Env::Priority::USER, nullptr /* IOTracer */, + /*manual_compaction_canceled=*/kManualCompactionCanceledFalse, + env_->GenerateUniqueId(), DBImpl::GenerateDbSessionId(nullptr), + full_history_ts_low_); + VerifyInitializationOfCompactionJobStats(compaction_job_stats_); + + compaction_job.Prepare(); + mutex_.Unlock(); + Status s = compaction_job.Run(); + ASSERT_OK(s); + ASSERT_OK(compaction_job.io_status()); + mutex_.Lock(); + ASSERT_OK(compaction_job.Install(*cfd->GetLatestMutableCFOptions())); + ASSERT_OK(compaction_job.io_status()); + mutex_.Unlock(); + log_buffer.FlushBufferToLog(); + + if (verify) { + ASSERT_GE(compaction_job_stats_.elapsed_micros, 0U); + ASSERT_EQ(compaction_job_stats_.num_input_files, num_input_files); + + VerifyTables(output_level, expected_results, + expected_oldest_blob_file_numbers); + } + + if (check_get_priority) { + CheckGetRateLimiterPriority(compaction_job); + } + + if (verify_per_key_placement_) { + // Verify per_key_placement compaction + assert(compaction.SupportsPerKeyPlacement()); + verify_per_key_placement_(compaction); + } + } + + void CheckGetRateLimiterPriority(CompactionJob& compaction_job) { + // When the state from WriteController is normal. 
+ ASSERT_EQ(compaction_job.GetRateLimiterPriority(), Env::IO_LOW); + + WriteController* write_controller = + compaction_job.versions_->GetColumnFamilySet()->write_controller(); + + { + // When the state from WriteController is Delayed. + std::unique_ptr<WriteControllerToken> delay_token = + write_controller->GetDelayToken(1000000); + ASSERT_EQ(compaction_job.GetRateLimiterPriority(), Env::IO_USER); + } + + { + // When the state from WriteController is Stopped. + std::unique_ptr<WriteControllerToken> stop_token = + write_controller->GetStopToken(); + ASSERT_EQ(compaction_job.GetRateLimiterPriority(), Env::IO_USER); + } + } + + std::shared_ptr<Env> env_guard_; + Env* env_; + std::shared_ptr<FileSystem> fs_; + std::string dbname_; + const Comparator* const ucmp_; + EnvOptions env_options_; + ImmutableDBOptions db_options_; + ColumnFamilyOptions cf_options_; + MutableCFOptions mutable_cf_options_; + MutableDBOptions mutable_db_options_; + std::shared_ptr<Cache> table_cache_; + WriteController write_controller_; + WriteBufferManager write_buffer_manager_; + std::unique_ptr<VersionSet> versions_; + InstrumentedMutex mutex_; + std::atomic<bool> shutting_down_; + std::shared_ptr<mock::MockTableFactory> mock_table_factory_; + CompactionJobStats compaction_job_stats_; + ColumnFamilyData* cfd_; + std::unique_ptr<CompactionFilter> compaction_filter_; + std::shared_ptr<MergeOperator> merge_op_; + ErrorHandler error_handler_; + std::string full_history_ts_low_; + const std::function<std::string(uint64_t)> encode_u64_ts_; + const bool test_io_priority_; + std::function<void(Compaction& comp)> verify_per_key_placement_; + const TableTypeForTest table_type_ = kMockTable; +}; + +// TODO(icanadi) Make it simpler once we mock out VersionSet +class CompactionJobTest : public CompactionJobTestBase { + public: + CompactionJobTest() + : CompactionJobTestBase( + test::PerThreadDBPath("compaction_job_test"), BytewiseComparator(), + [](uint64_t /*ts*/) { return ""; }, /*test_io_priority=*/false, + TableTypeForTest::kMockTable) {} +}; + +TEST_F(CompactionJobTest, Simple) { + NewDB(); + + auto expected_results = CreateTwoFiles(false); + auto cfd = versions_->GetColumnFamilySet()->GetDefault(); + constexpr int input_level = 0; + auto files = cfd->current()->storage_info()->LevelFiles(input_level); + ASSERT_EQ(2U, files.size()); + RunCompaction({files}, {input_level}, {expected_results}); +} + +TEST_F(CompactionJobTest, DISABLED_SimpleCorrupted) { + NewDB(); + + auto expected_results = CreateTwoFiles(true); + auto cfd = versions_->GetColumnFamilySet()->GetDefault(); + constexpr int input_level = 0; + auto files = cfd->current()->storage_info()->LevelFiles(input_level); + RunCompaction({files}, {input_level}, {expected_results}); + ASSERT_EQ(compaction_job_stats_.num_corrupt_keys, 400U); +} + +TEST_F(CompactionJobTest, SimpleDeletion) { + NewDB(); + + auto file1 = mock::MakeMockFile({{KeyStr("c", 4U, kTypeDeletion), ""}, + {KeyStr("c", 3U, kTypeValue), "val"}}); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile({{KeyStr("b", 2U, kTypeValue), "val"}, + {KeyStr("b", 1U, kTypeValue), "val"}}); + AddMockFile(file2); + + auto expected_results = + mock::MakeMockFile({{KeyStr("b", 0U, kTypeValue), "val"}}); + + SetLastSequence(4U); + constexpr int input_level = 0; + auto files = cfd_->current()->storage_info()->LevelFiles(input_level); + RunCompaction({files}, {input_level}, {expected_results}); +} + +TEST_F(CompactionJobTest, OutputNothing) { + NewDB(); + + auto file1 = mock::MakeMockFile({{KeyStr("a", 1U, kTypeValue), 
"val"}}); + + AddMockFile(file1); + + auto file2 = mock::MakeMockFile({{KeyStr("a", 2U, kTypeDeletion), ""}}); + + AddMockFile(file2); + + auto expected_results = mock::MakeMockFile(); + + SetLastSequence(4U); + + constexpr int input_level = 0; + auto files = cfd_->current()->storage_info()->LevelFiles(input_level); + RunCompaction({files}, {input_level}, {expected_results}); +} + +TEST_F(CompactionJobTest, SimpleOverwrite) { + NewDB(); + + auto file1 = mock::MakeMockFile({ + {KeyStr("a", 3U, kTypeValue), "val2"}, + {KeyStr("b", 4U, kTypeValue), "val3"}, + }); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile({{KeyStr("a", 1U, kTypeValue), "val"}, + {KeyStr("b", 2U, kTypeValue), "val"}}); + AddMockFile(file2); + + auto expected_results = + mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), "val2"}, + {KeyStr("b", 0U, kTypeValue), "val3"}}); + + SetLastSequence(4U); + constexpr int input_level = 0; + auto files = cfd_->current()->storage_info()->LevelFiles(input_level); + RunCompaction({files}, {input_level}, {expected_results}); +} + +TEST_F(CompactionJobTest, SimpleNonLastLevel) { + NewDB(); + + auto file1 = mock::MakeMockFile({ + {KeyStr("a", 5U, kTypeValue), "val2"}, + {KeyStr("b", 6U, kTypeValue), "val3"}, + }); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile({{KeyStr("a", 3U, kTypeValue), "val"}, + {KeyStr("b", 4U, kTypeValue), "val"}}); + AddMockFile(file2, 1); + + auto file3 = mock::MakeMockFile({{KeyStr("a", 1U, kTypeValue), "val"}, + {KeyStr("b", 2U, kTypeValue), "val"}}); + AddMockFile(file3, 2); + + // Because level 1 is not the last level, the sequence numbers of a and b + // cannot be set to 0 + auto expected_results = + mock::MakeMockFile({{KeyStr("a", 5U, kTypeValue), "val2"}, + {KeyStr("b", 6U, kTypeValue), "val3"}}); + + SetLastSequence(6U); + const std::vector<int> input_levels = {0, 1}; + auto lvl0_files = + cfd_->current()->storage_info()->LevelFiles(input_levels[0]); + auto lvl1_files = + cfd_->current()->storage_info()->LevelFiles(input_levels[1]); + RunCompaction({lvl0_files, lvl1_files}, input_levels, {expected_results}); +} + +TEST_F(CompactionJobTest, SimpleMerge) { + merge_op_ = MergeOperators::CreateStringAppendOperator(); + NewDB(); + + auto file1 = mock::MakeMockFile({ + {KeyStr("a", 5U, kTypeMerge), "5"}, + {KeyStr("a", 4U, kTypeMerge), "4"}, + {KeyStr("a", 3U, kTypeValue), "3"}, + }); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile( + {{KeyStr("b", 2U, kTypeMerge), "2"}, {KeyStr("b", 1U, kTypeValue), "1"}}); + AddMockFile(file2); + + auto expected_results = + mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), "3,4,5"}, + {KeyStr("b", 0U, kTypeValue), "1,2"}}); + + SetLastSequence(5U); + constexpr int input_level = 0; + auto files = cfd_->current()->storage_info()->LevelFiles(input_level); + RunCompaction({files}, {input_level}, {expected_results}); +} + +TEST_F(CompactionJobTest, NonAssocMerge) { + merge_op_ = MergeOperators::CreateStringAppendTESTOperator(); + NewDB(); + + auto file1 = mock::MakeMockFile({ + {KeyStr("a", 5U, kTypeMerge), "5"}, + {KeyStr("a", 4U, kTypeMerge), "4"}, + {KeyStr("a", 3U, kTypeMerge), "3"}, + }); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile( + {{KeyStr("b", 2U, kTypeMerge), "2"}, {KeyStr("b", 1U, kTypeMerge), "1"}}); + AddMockFile(file2); + + auto expected_results = + mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), "3,4,5"}, + {KeyStr("b", 0U, kTypeValue), "1,2"}}); + + SetLastSequence(5U); + constexpr int input_level = 0; + auto files = 
cfd_->current()->storage_info()->LevelFiles(input_level); + RunCompaction({files}, {input_level}, {expected_results}); +} + +// Filters merge operands with value 10. +TEST_F(CompactionJobTest, MergeOperandFilter) { + merge_op_ = MergeOperators::CreateUInt64AddOperator(); + compaction_filter_.reset(new test::FilterNumber(10U)); + NewDB(); + + auto file1 = mock::MakeMockFile( + {{KeyStr("a", 5U, kTypeMerge), test::EncodeInt(5U)}, + {KeyStr("a", 4U, kTypeMerge), test::EncodeInt(10U)}, // Filtered + {KeyStr("a", 3U, kTypeMerge), test::EncodeInt(3U)}}); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile({ + {KeyStr("b", 2U, kTypeMerge), test::EncodeInt(2U)}, + {KeyStr("b", 1U, kTypeMerge), test::EncodeInt(10U)} // Filtered + }); + AddMockFile(file2); + + auto expected_results = + mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), test::EncodeInt(8U)}, + {KeyStr("b", 0U, kTypeValue), test::EncodeInt(2U)}}); + + SetLastSequence(5U); + constexpr int input_level = 0; + auto files = cfd_->current()->storage_info()->LevelFiles(input_level); + RunCompaction({files}, {input_level}, {expected_results}); +} + +TEST_F(CompactionJobTest, FilterSomeMergeOperands) { + merge_op_ = MergeOperators::CreateUInt64AddOperator(); + compaction_filter_.reset(new test::FilterNumber(10U)); + NewDB(); + + auto file1 = mock::MakeMockFile( + {{KeyStr("a", 5U, kTypeMerge), test::EncodeInt(5U)}, + {KeyStr("a", 4U, kTypeMerge), test::EncodeInt(10U)}, // Filtered + {KeyStr("a", 3U, kTypeValue), test::EncodeInt(5U)}, + {KeyStr("d", 8U, kTypeMerge), test::EncodeInt(10U)}}); + AddMockFile(file1); + + auto file2 = + mock::MakeMockFile({{KeyStr("b", 2U, kTypeMerge), test::EncodeInt(10U)}, + {KeyStr("b", 1U, kTypeMerge), test::EncodeInt(10U)}, + {KeyStr("c", 2U, kTypeMerge), test::EncodeInt(3U)}, + {KeyStr("c", 1U, kTypeValue), test::EncodeInt(7U)}, + {KeyStr("d", 1U, kTypeValue), test::EncodeInt(6U)}}); + AddMockFile(file2); + + auto file3 = + mock::MakeMockFile({{KeyStr("a", 1U, kTypeMerge), test::EncodeInt(3U)}}); + AddMockFile(file3, 2); + + auto expected_results = mock::MakeMockFile({ + {KeyStr("a", 5U, kTypeValue), test::EncodeInt(10U)}, + {KeyStr("c", 2U, kTypeValue), test::EncodeInt(10U)}, + {KeyStr("d", 1U, kTypeValue), test::EncodeInt(6U)} + // b does not appear because the operands are filtered + }); + + SetLastSequence(5U); + constexpr int input_level = 0; + auto files = cfd_->current()->storage_info()->LevelFiles(input_level); + RunCompaction({files}, {input_level}, {expected_results}); +} + +// Test where all operands/merge results are filtered out. 
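The merge-operand tests around here rely on test::FilterNumber, which (roughly) drops any merge operand whose encoded value equals a chosen number. A standalone, hedged sketch of such a filter, assuming fixed-width 64-bit operands like those produced by test::EncodeInt; this is not the actual test::FilterNumber implementation:

  // Sketch: a compaction filter that discards matching merge operands.
  class FilterEncodedNumber : public CompactionFilter {
   public:
    explicit FilterEncodedNumber(uint64_t n) : n_(n) {}
    const char* Name() const override { return "FilterEncodedNumber"; }
    bool FilterMergeOperand(int /*level*/, const Slice& /*key*/,
                            const Slice& operand) const override {
      // Returning true drops the operand from the merge chain.
      return operand.size() == sizeof(uint64_t) &&
             DecodeFixed64(operand.data()) == n_;
    }
   private:
    uint64_t n_;
  };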
+TEST_F(CompactionJobTest, FilterAllMergeOperands) { + merge_op_ = MergeOperators::CreateUInt64AddOperator(); + compaction_filter_.reset(new test::FilterNumber(10U)); + NewDB(); + + auto file1 = + mock::MakeMockFile({{KeyStr("a", 11U, kTypeMerge), test::EncodeInt(10U)}, + {KeyStr("a", 10U, kTypeMerge), test::EncodeInt(10U)}, + {KeyStr("a", 9U, kTypeMerge), test::EncodeInt(10U)}}); + AddMockFile(file1); + + auto file2 = + mock::MakeMockFile({{KeyStr("b", 8U, kTypeMerge), test::EncodeInt(10U)}, + {KeyStr("b", 7U, kTypeMerge), test::EncodeInt(10U)}, + {KeyStr("b", 6U, kTypeMerge), test::EncodeInt(10U)}, + {KeyStr("b", 5U, kTypeMerge), test::EncodeInt(10U)}, + {KeyStr("b", 4U, kTypeMerge), test::EncodeInt(10U)}, + {KeyStr("b", 3U, kTypeMerge), test::EncodeInt(10U)}, + {KeyStr("b", 2U, kTypeMerge), test::EncodeInt(10U)}, + {KeyStr("c", 2U, kTypeMerge), test::EncodeInt(10U)}, + {KeyStr("c", 1U, kTypeMerge), test::EncodeInt(10U)}}); + AddMockFile(file2); + + auto file3 = + mock::MakeMockFile({{KeyStr("a", 2U, kTypeMerge), test::EncodeInt(10U)}, + {KeyStr("b", 1U, kTypeMerge), test::EncodeInt(10U)}}); + AddMockFile(file3, 2); + + SetLastSequence(11U); + constexpr int input_level = 0; + auto files = cfd_->current()->storage_info()->LevelFiles(input_level); + + mock::KVVector empty_map; + RunCompaction({files}, {input_level}, {empty_map}); +} + +TEST_F(CompactionJobTest, SimpleSingleDelete) { + NewDB(); + + auto file1 = mock::MakeMockFile({ + {KeyStr("a", 5U, kTypeDeletion), ""}, + {KeyStr("b", 6U, kTypeSingleDeletion), ""}, + }); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile({{KeyStr("a", 3U, kTypeValue), "val"}, + {KeyStr("b", 4U, kTypeValue), "val"}}); + AddMockFile(file2); + + auto file3 = mock::MakeMockFile({ + {KeyStr("a", 1U, kTypeValue), "val"}, + }); + AddMockFile(file3, 2); + + auto expected_results = + mock::MakeMockFile({{KeyStr("a", 5U, kTypeDeletion), ""}}); + + SetLastSequence(6U); + constexpr int input_level = 0; + auto files = cfd_->current()->storage_info()->LevelFiles(input_level); + RunCompaction({files}, {input_level}, {expected_results}); +} + +TEST_F(CompactionJobTest, SingleDeleteSnapshots) { + NewDB(); + + auto file1 = mock::MakeMockFile({ + {KeyStr("A", 12U, kTypeSingleDeletion), ""}, + {KeyStr("a", 12U, kTypeSingleDeletion), ""}, + {KeyStr("b", 21U, kTypeSingleDeletion), ""}, + {KeyStr("c", 22U, kTypeSingleDeletion), ""}, + {KeyStr("d", 9U, kTypeSingleDeletion), ""}, + {KeyStr("f", 21U, kTypeSingleDeletion), ""}, + {KeyStr("j", 11U, kTypeSingleDeletion), ""}, + {KeyStr("j", 9U, kTypeSingleDeletion), ""}, + {KeyStr("k", 12U, kTypeSingleDeletion), ""}, + {KeyStr("k", 11U, kTypeSingleDeletion), ""}, + {KeyStr("l", 3U, kTypeSingleDeletion), ""}, + {KeyStr("l", 2U, kTypeSingleDeletion), ""}, + }); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile({ + {KeyStr("0", 2U, kTypeSingleDeletion), ""}, + {KeyStr("a", 11U, kTypeValue), "val1"}, + {KeyStr("b", 11U, kTypeValue), "val2"}, + {KeyStr("c", 21U, kTypeValue), "val3"}, + {KeyStr("d", 8U, kTypeValue), "val4"}, + {KeyStr("e", 2U, kTypeSingleDeletion), ""}, + {KeyStr("f", 1U, kTypeValue), "val1"}, + {KeyStr("g", 11U, kTypeSingleDeletion), ""}, + {KeyStr("h", 2U, kTypeSingleDeletion), ""}, + {KeyStr("m", 12U, kTypeValue), "val1"}, + {KeyStr("m", 11U, kTypeSingleDeletion), ""}, + {KeyStr("m", 8U, kTypeValue), "val2"}, + }); + AddMockFile(file2); + + auto file3 = mock::MakeMockFile({ + {KeyStr("A", 1U, kTypeValue), "val"}, + {KeyStr("e", 1U, kTypeValue), "val"}, + }); + AddMockFile(file3, 2); + + auto 
expected_results = mock::MakeMockFile({ + {KeyStr("A", 12U, kTypeSingleDeletion), ""}, + {KeyStr("a", 12U, kTypeSingleDeletion), ""}, + {KeyStr("a", 11U, kTypeValue), ""}, + {KeyStr("b", 21U, kTypeSingleDeletion), ""}, + {KeyStr("b", 11U, kTypeValue), "val2"}, + {KeyStr("c", 22U, kTypeSingleDeletion), ""}, + {KeyStr("c", 21U, kTypeValue), ""}, + {KeyStr("e", 2U, kTypeSingleDeletion), ""}, + {KeyStr("f", 21U, kTypeSingleDeletion), ""}, + {KeyStr("f", 1U, kTypeValue), "val1"}, + {KeyStr("g", 11U, kTypeSingleDeletion), ""}, + {KeyStr("j", 11U, kTypeSingleDeletion), ""}, + {KeyStr("k", 11U, kTypeSingleDeletion), ""}, + {KeyStr("m", 12U, kTypeValue), "val1"}, + {KeyStr("m", 11U, kTypeSingleDeletion), ""}, + {KeyStr("m", 8U, kTypeValue), "val2"}, + }); + + SetLastSequence(22U); + constexpr int input_level = 0; + auto files = cfd_->current()->storage_info()->LevelFiles(input_level); + RunCompaction({files}, {input_level}, {expected_results}, {10U, 20U}, 10U); +} + +TEST_F(CompactionJobTest, EarliestWriteConflictSnapshot) { + NewDB(); + + // Test multiple snapshots where the earliest snapshot is not a + // write-conflic-snapshot. + + auto file1 = mock::MakeMockFile({ + {KeyStr("A", 24U, kTypeSingleDeletion), ""}, + {KeyStr("A", 23U, kTypeValue), "val"}, + {KeyStr("B", 24U, kTypeSingleDeletion), ""}, + {KeyStr("B", 23U, kTypeValue), "val"}, + {KeyStr("D", 24U, kTypeSingleDeletion), ""}, + {KeyStr("G", 32U, kTypeSingleDeletion), ""}, + {KeyStr("G", 31U, kTypeValue), "val"}, + {KeyStr("G", 24U, kTypeSingleDeletion), ""}, + {KeyStr("G", 23U, kTypeValue), "val2"}, + {KeyStr("H", 31U, kTypeValue), "val"}, + {KeyStr("H", 24U, kTypeSingleDeletion), ""}, + {KeyStr("H", 23U, kTypeValue), "val"}, + {KeyStr("I", 35U, kTypeSingleDeletion), ""}, + {KeyStr("I", 34U, kTypeValue), "val2"}, + {KeyStr("I", 33U, kTypeSingleDeletion), ""}, + {KeyStr("I", 32U, kTypeValue), "val3"}, + {KeyStr("I", 31U, kTypeSingleDeletion), ""}, + {KeyStr("J", 34U, kTypeValue), "val"}, + {KeyStr("J", 33U, kTypeSingleDeletion), ""}, + {KeyStr("J", 25U, kTypeValue), "val2"}, + {KeyStr("J", 24U, kTypeSingleDeletion), ""}, + }); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile({ + {KeyStr("A", 14U, kTypeSingleDeletion), ""}, + {KeyStr("A", 13U, kTypeValue), "val2"}, + {KeyStr("C", 14U, kTypeSingleDeletion), ""}, + {KeyStr("C", 13U, kTypeValue), "val"}, + {KeyStr("E", 12U, kTypeSingleDeletion), ""}, + {KeyStr("F", 4U, kTypeSingleDeletion), ""}, + {KeyStr("F", 3U, kTypeValue), "val"}, + {KeyStr("G", 14U, kTypeSingleDeletion), ""}, + {KeyStr("G", 13U, kTypeValue), "val3"}, + {KeyStr("H", 14U, kTypeSingleDeletion), ""}, + {KeyStr("H", 13U, kTypeValue), "val2"}, + {KeyStr("I", 13U, kTypeValue), "val4"}, + {KeyStr("I", 12U, kTypeSingleDeletion), ""}, + {KeyStr("I", 11U, kTypeValue), "val5"}, + {KeyStr("J", 15U, kTypeValue), "val3"}, + {KeyStr("J", 14U, kTypeSingleDeletion), ""}, + }); + AddMockFile(file2); + + auto expected_results = mock::MakeMockFile({ + {KeyStr("A", 24U, kTypeSingleDeletion), ""}, + {KeyStr("A", 23U, kTypeValue), ""}, + {KeyStr("B", 24U, kTypeSingleDeletion), ""}, + {KeyStr("B", 23U, kTypeValue), ""}, + {KeyStr("D", 24U, kTypeSingleDeletion), ""}, + {KeyStr("E", 12U, kTypeSingleDeletion), ""}, + {KeyStr("G", 32U, kTypeSingleDeletion), ""}, + {KeyStr("G", 31U, kTypeValue), ""}, + {KeyStr("H", 31U, kTypeValue), "val"}, + {KeyStr("I", 35U, kTypeSingleDeletion), ""}, + {KeyStr("I", 34U, kTypeValue), ""}, + {KeyStr("I", 31U, kTypeSingleDeletion), ""}, + {KeyStr("I", 13U, kTypeValue), "val4"}, + {KeyStr("J", 34U, 
kTypeValue), "val"}, + {KeyStr("J", 33U, kTypeSingleDeletion), ""}, + {KeyStr("J", 25U, kTypeValue), "val2"}, + {KeyStr("J", 24U, kTypeSingleDeletion), ""}, + {KeyStr("J", 15U, kTypeValue), "val3"}, + {KeyStr("J", 14U, kTypeSingleDeletion), ""}, + }); + + SetLastSequence(24U); + constexpr int input_level = 0; + auto files = cfd_->current()->storage_info()->LevelFiles(input_level); + RunCompaction({files}, {input_level}, {expected_results}, {10U, 20U, 30U}, + 20U); +} + +TEST_F(CompactionJobTest, SingleDeleteZeroSeq) { + NewDB(); + + auto file1 = mock::MakeMockFile({ + {KeyStr("A", 10U, kTypeSingleDeletion), ""}, + {KeyStr("dummy", 5U, kTypeValue), "val2"}, + }); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile({ + {KeyStr("A", 0U, kTypeValue), "val"}, + }); + AddMockFile(file2); + + auto expected_results = mock::MakeMockFile({ + {KeyStr("dummy", 0U, kTypeValue), "val2"}, + }); + + SetLastSequence(22U); + constexpr int input_level = 0; + auto files = cfd_->current()->storage_info()->LevelFiles(input_level); + RunCompaction({files}, {input_level}, {expected_results}, {}); +} + +TEST_F(CompactionJobTest, MultiSingleDelete) { + // Tests three scenarios involving multiple single delete/put pairs: + // + // A: Put Snapshot SDel Put SDel -> Put Snapshot SDel + // B: Snapshot Put SDel Put SDel Snapshot -> Snapshot SDel Snapshot + // C: SDel Put SDel Snapshot Put -> Snapshot Put + // D: (Put) SDel Snapshot Put SDel -> (Put) SDel Snapshot SDel + // E: Put SDel Snapshot Put SDel -> Snapshot SDel + // F: Put SDel Put Sdel Snapshot -> removed + // G: Snapshot SDel Put SDel Put -> Snapshot Put SDel + // H: (Put) Put SDel Put Sdel Snapshot -> Removed + // I: (Put) Snapshot Put SDel Put SDel -> SDel + // J: Put Put SDel Put SDel SDel Snapshot Put Put SDel SDel Put + // -> Snapshot Put + // K: SDel SDel Put SDel Put Put Snapshot SDel Put SDel SDel Put SDel + // -> Snapshot Put Snapshot SDel + // L: SDel Put SDel Put SDel Snapshot SDel Put SDel SDel Put SDel + // -> Snapshot SDel Put SDel + // M: (Put) SDel Put SDel Put SDel Snapshot Put SDel SDel Put SDel SDel + // -> SDel Snapshot Put SDel + NewDB(); + + auto file1 = mock::MakeMockFile({ + {KeyStr("A", 14U, kTypeSingleDeletion), ""}, + {KeyStr("A", 13U, kTypeValue), "val5"}, + {KeyStr("A", 12U, kTypeSingleDeletion), ""}, + {KeyStr("B", 14U, kTypeSingleDeletion), ""}, + {KeyStr("B", 13U, kTypeValue), "val2"}, + {KeyStr("C", 14U, kTypeValue), "val3"}, + {KeyStr("D", 12U, kTypeSingleDeletion), ""}, + {KeyStr("D", 11U, kTypeValue), "val4"}, + {KeyStr("G", 15U, kTypeValue), "val"}, + {KeyStr("G", 14U, kTypeSingleDeletion), ""}, + {KeyStr("G", 13U, kTypeValue), "val"}, + {KeyStr("I", 14U, kTypeSingleDeletion), ""}, + {KeyStr("I", 13U, kTypeValue), "val"}, + {KeyStr("J", 15U, kTypeValue), "val"}, + {KeyStr("J", 14U, kTypeSingleDeletion), ""}, + {KeyStr("J", 13U, kTypeSingleDeletion), ""}, + {KeyStr("J", 12U, kTypeValue), "val"}, + {KeyStr("J", 11U, kTypeValue), "val"}, + {KeyStr("K", 16U, kTypeSingleDeletion), ""}, + {KeyStr("K", 15U, kTypeValue), "val1"}, + {KeyStr("K", 14U, kTypeSingleDeletion), ""}, + {KeyStr("K", 13U, kTypeSingleDeletion), ""}, + {KeyStr("K", 12U, kTypeValue), "val2"}, + {KeyStr("K", 11U, kTypeSingleDeletion), ""}, + {KeyStr("L", 16U, kTypeSingleDeletion), ""}, + {KeyStr("L", 15U, kTypeValue), "val"}, + {KeyStr("L", 14U, kTypeSingleDeletion), ""}, + {KeyStr("L", 13U, kTypeSingleDeletion), ""}, + {KeyStr("L", 12U, kTypeValue), "val"}, + {KeyStr("L", 11U, kTypeSingleDeletion), ""}, + {KeyStr("M", 16U, kTypeSingleDeletion), ""}, + 
{KeyStr("M", 15U, kTypeSingleDeletion), ""}, + {KeyStr("M", 14U, kTypeValue), "val"}, + {KeyStr("M", 13U, kTypeSingleDeletion), ""}, + {KeyStr("M", 12U, kTypeSingleDeletion), ""}, + {KeyStr("M", 11U, kTypeValue), "val"}, + }); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile({ + {KeyStr("A", 10U, kTypeValue), "val"}, + {KeyStr("B", 12U, kTypeSingleDeletion), ""}, + {KeyStr("B", 11U, kTypeValue), "val2"}, + {KeyStr("C", 10U, kTypeSingleDeletion), ""}, + {KeyStr("C", 9U, kTypeValue), "val6"}, + {KeyStr("C", 8U, kTypeSingleDeletion), ""}, + {KeyStr("D", 10U, kTypeSingleDeletion), ""}, + {KeyStr("E", 12U, kTypeSingleDeletion), ""}, + {KeyStr("E", 11U, kTypeValue), "val"}, + {KeyStr("E", 5U, kTypeSingleDeletion), ""}, + {KeyStr("E", 4U, kTypeValue), "val"}, + {KeyStr("F", 6U, kTypeSingleDeletion), ""}, + {KeyStr("F", 5U, kTypeValue), "val"}, + {KeyStr("F", 4U, kTypeSingleDeletion), ""}, + {KeyStr("F", 3U, kTypeValue), "val"}, + {KeyStr("G", 12U, kTypeSingleDeletion), ""}, + {KeyStr("H", 6U, kTypeSingleDeletion), ""}, + {KeyStr("H", 5U, kTypeValue), "val"}, + {KeyStr("H", 4U, kTypeSingleDeletion), ""}, + {KeyStr("H", 3U, kTypeValue), "val"}, + {KeyStr("I", 12U, kTypeSingleDeletion), ""}, + {KeyStr("I", 11U, kTypeValue), "val"}, + {KeyStr("J", 6U, kTypeSingleDeletion), ""}, + {KeyStr("J", 5U, kTypeSingleDeletion), ""}, + {KeyStr("J", 4U, kTypeValue), "val"}, + {KeyStr("J", 3U, kTypeSingleDeletion), ""}, + {KeyStr("J", 2U, kTypeValue), "val"}, + {KeyStr("K", 8U, kTypeValue), "val3"}, + {KeyStr("K", 7U, kTypeValue), "val4"}, + {KeyStr("K", 6U, kTypeSingleDeletion), ""}, + {KeyStr("K", 5U, kTypeValue), "val5"}, + {KeyStr("K", 2U, kTypeSingleDeletion), ""}, + {KeyStr("K", 1U, kTypeSingleDeletion), ""}, + {KeyStr("L", 5U, kTypeSingleDeletion), ""}, + {KeyStr("L", 4U, kTypeValue), "val"}, + {KeyStr("L", 3U, kTypeSingleDeletion), ""}, + {KeyStr("L", 2U, kTypeValue), "val"}, + {KeyStr("L", 1U, kTypeSingleDeletion), ""}, + {KeyStr("M", 10U, kTypeSingleDeletion), ""}, + {KeyStr("M", 7U, kTypeValue), "val"}, + {KeyStr("M", 5U, kTypeSingleDeletion), ""}, + {KeyStr("M", 4U, kTypeValue), "val"}, + {KeyStr("M", 3U, kTypeSingleDeletion), ""}, + }); + AddMockFile(file2); + + auto file3 = mock::MakeMockFile({ + {KeyStr("D", 1U, kTypeValue), "val"}, + {KeyStr("H", 1U, kTypeValue), "val"}, + {KeyStr("I", 2U, kTypeValue), "val"}, + }); + AddMockFile(file3, 2); + + auto file4 = mock::MakeMockFile({ + {KeyStr("M", 1U, kTypeValue), "val"}, + }); + AddMockFile(file4, 2); + + auto expected_results = + mock::MakeMockFile({{KeyStr("A", 14U, kTypeSingleDeletion), ""}, + {KeyStr("A", 13U, kTypeValue), ""}, + {KeyStr("A", 12U, kTypeSingleDeletion), ""}, + {KeyStr("A", 10U, kTypeValue), "val"}, + {KeyStr("B", 14U, kTypeSingleDeletion), ""}, + {KeyStr("B", 13U, kTypeValue), ""}, + {KeyStr("C", 14U, kTypeValue), "val3"}, + {KeyStr("D", 12U, kTypeSingleDeletion), ""}, + {KeyStr("D", 11U, kTypeValue), ""}, + {KeyStr("D", 10U, kTypeSingleDeletion), ""}, + {KeyStr("E", 12U, kTypeSingleDeletion), ""}, + {KeyStr("E", 11U, kTypeValue), ""}, + {KeyStr("G", 15U, kTypeValue), "val"}, + {KeyStr("G", 12U, kTypeSingleDeletion), ""}, + {KeyStr("I", 14U, kTypeSingleDeletion), ""}, + {KeyStr("I", 13U, kTypeValue), ""}, + {KeyStr("J", 15U, kTypeValue), "val"}, + {KeyStr("K", 16U, kTypeSingleDeletion), ""}, + {KeyStr("K", 15U, kTypeValue), ""}, + {KeyStr("K", 11U, kTypeSingleDeletion), ""}, + {KeyStr("K", 8U, kTypeValue), "val3"}, + {KeyStr("L", 16U, kTypeSingleDeletion), ""}, + {KeyStr("L", 15U, kTypeValue), ""}, + {KeyStr("L", 11U, 
kTypeSingleDeletion), ""}, + {KeyStr("M", 15U, kTypeSingleDeletion), ""}, + {KeyStr("M", 14U, kTypeValue), ""}, + {KeyStr("M", 3U, kTypeSingleDeletion), ""}}); + + SetLastSequence(22U); + constexpr int input_level = 0; + auto files = cfd_->current()->storage_info()->LevelFiles(input_level); + RunCompaction({files}, {input_level}, {expected_results}, {10U}, 10U); +} + +// This test documents the behavior where a corrupt key follows a deletion or a +// single deletion and the (single) deletion gets removed while the corrupt key +// gets written out. TODO(noetzli): We probably want a better way to treat +// corrupt keys. +TEST_F(CompactionJobTest, DISABLED_CorruptionAfterDeletion) { + NewDB(); + + auto file1 = + mock::MakeMockFile({{test::KeyStr("A", 6U, kTypeValue), "val3"}, + {test::KeyStr("a", 5U, kTypeDeletion), ""}, + {test::KeyStr("a", 4U, kTypeValue, true), "val"}}); + AddMockFile(file1); + + auto file2 = + mock::MakeMockFile({{test::KeyStr("b", 3U, kTypeSingleDeletion), ""}, + {test::KeyStr("b", 2U, kTypeValue, true), "val"}, + {test::KeyStr("c", 1U, kTypeValue), "val2"}}); + AddMockFile(file2); + + auto expected_results = + mock::MakeMockFile({{test::KeyStr("A", 0U, kTypeValue), "val3"}, + {test::KeyStr("a", 0U, kTypeValue, true), "val"}, + {test::KeyStr("b", 0U, kTypeValue, true), "val"}, + {test::KeyStr("c", 0U, kTypeValue), "val2"}}); + + SetLastSequence(6U); + constexpr int input_level = 0; + auto files = cfd_->current()->storage_info()->LevelFiles(input_level); + RunCompaction({files}, {input_level}, {expected_results}); +} + +TEST_F(CompactionJobTest, OldestBlobFileNumber) { + NewDB(); + + // Note: blob1 is inlined TTL, so it will not be considered for the purposes + // of identifying the oldest referenced blob file. Similarly, blob6 will be + // ignored because it has TTL and hence refers to a TTL blob file. 
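For context on the expectation at the end of this test (oldest referenced blob file 19, i.e. blob5): inlined and TTL blob references are skipped, so only the plain references to blob files 59, 138, 199 and 19 are considered and the minimum wins. A standalone, hedged sketch of that selection, assuming the BlobIndex helper from db/blob/blob_index.h:

  // Sketch: track the smallest eligible blob file number seen so far.
  void MaybeUpdateOldestBlobFile(const Slice& blob_index_slice, uint64_t* oldest) {
    BlobIndex blob_index;
    if (!blob_index.DecodeFrom(blob_index_slice).ok()) {
      return;  // not a decodable blob index
    }
    if (blob_index.IsInlined() || blob_index.HasTTL()) {
      return;  // ignored, as the note above explains
    }
    *oldest = std::min(*oldest, blob_index.file_number());
  }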
+ const stl_wrappers::KVMap::value_type blob1( + KeyStr("a", 1U, kTypeBlobIndex), BlobStrInlinedTTL("foo", 1234567890ULL)); + const stl_wrappers::KVMap::value_type blob2(KeyStr("b", 2U, kTypeBlobIndex), + BlobStr(59, 123456, 999)); + const stl_wrappers::KVMap::value_type blob3(KeyStr("c", 3U, kTypeBlobIndex), + BlobStr(138, 1000, 1 << 8)); + auto file1 = mock::MakeMockFile({blob1, blob2, blob3}); + AddMockFile(file1); + + const stl_wrappers::KVMap::value_type blob4(KeyStr("d", 4U, kTypeBlobIndex), + BlobStr(199, 3 << 10, 1 << 20)); + const stl_wrappers::KVMap::value_type blob5(KeyStr("e", 5U, kTypeBlobIndex), + BlobStr(19, 6789, 333)); + const stl_wrappers::KVMap::value_type blob6( + KeyStr("f", 6U, kTypeBlobIndex), + BlobStrTTL(5, 2048, 1 << 7, 1234567890ULL)); + auto file2 = mock::MakeMockFile({blob4, blob5, blob6}); + AddMockFile(file2); + + const stl_wrappers::KVMap::value_type expected_blob1( + KeyStr("a", 0U, kTypeBlobIndex), blob1.second); + const stl_wrappers::KVMap::value_type expected_blob2( + KeyStr("b", 0U, kTypeBlobIndex), blob2.second); + const stl_wrappers::KVMap::value_type expected_blob3( + KeyStr("c", 0U, kTypeBlobIndex), blob3.second); + const stl_wrappers::KVMap::value_type expected_blob4( + KeyStr("d", 0U, kTypeBlobIndex), blob4.second); + const stl_wrappers::KVMap::value_type expected_blob5( + KeyStr("e", 0U, kTypeBlobIndex), blob5.second); + const stl_wrappers::KVMap::value_type expected_blob6( + KeyStr("f", 0U, kTypeBlobIndex), blob6.second); + auto expected_results = + mock::MakeMockFile({expected_blob1, expected_blob2, expected_blob3, + expected_blob4, expected_blob5, expected_blob6}); + + SetLastSequence(6U); + constexpr int input_level = 0; + auto files = cfd_->current()->storage_info()->LevelFiles(input_level); + RunCompaction({files}, {input_level}, {expected_results}, + std::vector<SequenceNumber>(), kMaxSequenceNumber, + /* output_level */ 1, /* verify */ true, + /* expected_oldest_blob_file_numbers */ {19}); +} + +TEST_F(CompactionJobTest, VerifyPenultimateLevelOutput) { + cf_options_.bottommost_temperature = Temperature::kCold; + SyncPoint::GetInstance()->SetCallBack( + "Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) { + auto supports_per_key_placement = static_cast<bool*>(arg); + *supports_per_key_placement = true; + }); + + std::atomic_uint64_t latest_cold_seq = 0; + + SyncPoint::GetInstance()->SetCallBack( + "CompactionIterator::PrepareOutput.context", [&](void* arg) { + auto context = static_cast<PerKeyPlacementContext*>(arg); + context->output_to_penultimate_level = + context->seq_num > latest_cold_seq; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + NewDB(); + + // Add files on different levels that may overlap + auto file0_1 = mock::MakeMockFile({{KeyStr("z", 12U, kTypeValue), "val"}}); + AddMockFile(file0_1); + + auto file1_1 = mock::MakeMockFile({{KeyStr("b", 10U, kTypeValue), "val"}, + {KeyStr("f", 11U, kTypeValue), "val"}}); + AddMockFile(file1_1, 1); + auto file1_2 = mock::MakeMockFile({{KeyStr("j", 12U, kTypeValue), "val"}, + {KeyStr("k", 13U, kTypeValue), "val"}}); + AddMockFile(file1_2, 1); + auto file1_3 = mock::MakeMockFile({{KeyStr("p", 14U, kTypeValue), "val"}, + {KeyStr("u", 15U, kTypeValue), "val"}}); + AddMockFile(file1_3, 1); + + auto file2_1 = mock::MakeMockFile({{KeyStr("f", 8U, kTypeValue), "val"}, + {KeyStr("h", 9U, kTypeValue), "val"}}); + AddMockFile(file2_1, 2); + auto file2_2 = mock::MakeMockFile({{KeyStr("m", 6U, kTypeValue), "val"}, + {KeyStr("p", 7U, kTypeValue), "val"}}); + AddMockFile(file2_2, 2); 
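For orientation while the input levels are being populated: the verification at the end of this test expects only "a" to fall outside the penultimate-level output range. With latest_cold_seq left at 0, every key is marked for the penultimate level, and the output range is (roughly speaking, this is a hedged reading rather than the exact implementation) bracketed by the smallest and largest user keys among the compaction inputs, here "b" and "z". A standalone sketch of such a range check:

  // Sketch only: inclusive range check under the user comparator.
  bool WithinRange(const Slice& user_key, const Slice& smallest,
                   const Slice& largest, const Comparator* ucmp) {
    return ucmp->Compare(user_key, smallest) >= 0 &&
           ucmp->Compare(user_key, largest) <= 0;
  }
  // With boundaries "b" and "z", only "a" of 'a'..'z' is excluded, matching
  // the verify_func passed to RunLastLevelCompaction below.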
+ + auto file3_1 = mock::MakeMockFile({{KeyStr("g", 2U, kTypeValue), "val"}, + {KeyStr("k", 3U, kTypeValue), "val"}}); + AddMockFile(file3_1, 3); + auto file3_2 = mock::MakeMockFile({{KeyStr("v", 4U, kTypeValue), "val"}, + {KeyStr("x", 5U, kTypeValue), "val"}}); + AddMockFile(file3_2, 3); + + auto cfd = versions_->GetColumnFamilySet()->GetDefault(); + const std::vector<int> input_levels = {0, 1, 2, 3}; + auto files0 = cfd->current()->storage_info()->LevelFiles(input_levels[0]); + auto files1 = cfd->current()->storage_info()->LevelFiles(input_levels[1]); + auto files2 = cfd->current()->storage_info()->LevelFiles(input_levels[2]); + auto files3 = cfd->current()->storage_info()->LevelFiles(input_levels[3]); + + RunLastLevelCompaction( + {files0, files1, files2, files3}, input_levels, + /*verify_func=*/[&](Compaction& comp) { + for (char c = 'a'; c <= 'z'; c++) { + std::string c_str; + c_str = c; + const Slice key(c_str); + if (c == 'a') { + ASSERT_FALSE(comp.WithinPenultimateLevelOutputRange(key)); + } else { + ASSERT_TRUE(comp.WithinPenultimateLevelOutputRange(key)); + } + } + }); +} + +TEST_F(CompactionJobTest, NoEnforceSingleDeleteContract) { + db_options_.enforce_single_del_contracts = false; + NewDB(); + + auto file = + mock::MakeMockFile({{KeyStr("a", 4U, kTypeSingleDeletion), ""}, + {KeyStr("a", 3U, kTypeDeletion), "dontcare"}}); + AddMockFile(file); + SetLastSequence(4U); + + auto expected_results = mock::MakeMockFile(); + constexpr int input_level = 0; + auto files = cfd_->current()->storage_info()->LevelFiles(input_level); + RunCompaction({files}, {input_level}, {expected_results}); +} + +TEST_F(CompactionJobTest, InputSerialization) { + // Setup a random CompactionServiceInput + CompactionServiceInput input; + const int kStrMaxLen = 1000; + Random rnd(static_cast<uint32_t>(time(nullptr))); + Random64 rnd64(time(nullptr)); + input.column_family.name = rnd.RandomString(rnd.Uniform(kStrMaxLen)); + input.column_family.options.comparator = ReverseBytewiseComparator(); + input.column_family.options.max_bytes_for_level_base = + rnd64.Uniform(UINT64_MAX); + input.column_family.options.disable_auto_compactions = rnd.OneIn(2); + input.column_family.options.compression = kZSTD; + input.column_family.options.compression_opts.level = 4; + input.db_options.max_background_flushes = 10; + input.db_options.paranoid_checks = rnd.OneIn(2); + input.db_options.statistics = CreateDBStatistics(); + input.db_options.env = env_; + while (!rnd.OneIn(10)) { + input.snapshots.emplace_back(rnd64.Uniform(UINT64_MAX)); + } + while (!rnd.OneIn(10)) { + input.input_files.emplace_back(rnd.RandomString( + rnd.Uniform(kStrMaxLen - 1) + + 1)); // input file name should have at least one character + } + input.output_level = 4; + input.has_begin = rnd.OneIn(2); + if (input.has_begin) { + input.begin = rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen)); + } + input.has_end = rnd.OneIn(2); + if (input.has_end) { + input.end = rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen)); + } + + std::string output; + ASSERT_OK(input.Write(&output)); + + // Test deserialization + CompactionServiceInput deserialized1; + ASSERT_OK(CompactionServiceInput::Read(output, &deserialized1)); + ASSERT_TRUE(deserialized1.TEST_Equals(&input)); + + // Test mismatch + deserialized1.db_options.max_background_flushes += 10; + std::string mismatch; + ASSERT_FALSE(deserialized1.TEST_Equals(&input, &mismatch)); + ASSERT_EQ(mismatch, "db_options.max_background_flushes"); + + // Test unknown field + CompactionServiceInput deserialized2; + output.clear(); + 
ASSERT_OK(input.Write(&output)); + output.append("new_field=123;"); + + ASSERT_OK(CompactionServiceInput::Read(output, &deserialized2)); + ASSERT_TRUE(deserialized2.TEST_Equals(&input)); + + // Test missing field + CompactionServiceInput deserialized3; + deserialized3.output_level = 0; + std::string to_remove = "output_level=4;"; + size_t pos = output.find(to_remove); + ASSERT_TRUE(pos != std::string::npos); + output.erase(pos, to_remove.length()); + ASSERT_OK(CompactionServiceInput::Read(output, &deserialized3)); + mismatch.clear(); + ASSERT_FALSE(deserialized3.TEST_Equals(&input, &mismatch)); + ASSERT_EQ(mismatch, "output_level"); + + // manually set the value back, should match the original structure + deserialized3.output_level = 4; + ASSERT_TRUE(deserialized3.TEST_Equals(&input)); + + // Test invalid version + output.clear(); + ASSERT_OK(input.Write(&output)); + + uint32_t data_version = DecodeFixed32(output.data()); + const size_t kDataVersionSize = sizeof(data_version); + ASSERT_EQ(data_version, + 1U); // Update once the default data version is changed + char buf[kDataVersionSize]; + EncodeFixed32(buf, data_version + 10); // make sure it's not valid + output.replace(0, kDataVersionSize, buf, kDataVersionSize); + Status s = CompactionServiceInput::Read(output, &deserialized3); + ASSERT_TRUE(s.IsNotSupported()); +} + +TEST_F(CompactionJobTest, ResultSerialization) { + // Setup a random CompactionServiceResult + CompactionServiceResult result; + const int kStrMaxLen = 1000; + Random rnd(static_cast<uint32_t>(time(nullptr))); + Random64 rnd64(time(nullptr)); + std::vector<Status> status_list = { + Status::OK(), + Status::InvalidArgument("invalid option"), + Status::Aborted("failed to run"), + Status::NotSupported("not supported option"), + }; + result.status = + status_list.at(rnd.Uniform(static_cast<int>(status_list.size()))); + while (!rnd.OneIn(10)) { + UniqueId64x2 id{rnd64.Uniform(UINT64_MAX), rnd64.Uniform(UINT64_MAX)}; + result.output_files.emplace_back( + rnd.RandomString(rnd.Uniform(kStrMaxLen)), rnd64.Uniform(UINT64_MAX), + rnd64.Uniform(UINT64_MAX), + rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen)), + rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen)), + rnd64.Uniform(UINT64_MAX), rnd64.Uniform(UINT64_MAX), + rnd64.Uniform(UINT64_MAX), rnd.OneIn(2), id); + } + result.output_level = rnd.Uniform(10); + result.output_path = rnd.RandomString(rnd.Uniform(kStrMaxLen)); + result.num_output_records = rnd64.Uniform(UINT64_MAX); + result.total_bytes = rnd64.Uniform(UINT64_MAX); + result.bytes_read = 123; + result.bytes_written = rnd64.Uniform(UINT64_MAX); + result.stats.elapsed_micros = rnd64.Uniform(UINT64_MAX); + result.stats.num_output_files = rnd.Uniform(1000); + result.stats.is_full_compaction = rnd.OneIn(2); + result.stats.num_single_del_mismatch = rnd64.Uniform(UINT64_MAX); + result.stats.num_input_files = 9; + + std::string output; + ASSERT_OK(result.Write(&output)); + + // Test deserialization + CompactionServiceResult deserialized1; + ASSERT_OK(CompactionServiceResult::Read(output, &deserialized1)); + ASSERT_TRUE(deserialized1.TEST_Equals(&result)); + + // Test mismatch + deserialized1.stats.num_input_files += 10; + std::string mismatch; + ASSERT_FALSE(deserialized1.TEST_Equals(&result, &mismatch)); + ASSERT_EQ(mismatch, "stats.num_input_files"); + + // Test unique id mismatch + if (!result.output_files.empty()) { + CompactionServiceResult deserialized_tmp; + ASSERT_OK(CompactionServiceResult::Read(output, &deserialized_tmp)); + deserialized_tmp.output_files[0].unique_id[0] += 
1; + ASSERT_FALSE(deserialized_tmp.TEST_Equals(&result, &mismatch)); + ASSERT_EQ(mismatch, "output_files.unique_id"); + deserialized_tmp.status.PermitUncheckedError(); + } + + // Test unknown field + CompactionServiceResult deserialized2; + output.clear(); + ASSERT_OK(result.Write(&output)); + output.append("new_field=123;"); + + ASSERT_OK(CompactionServiceResult::Read(output, &deserialized2)); + ASSERT_TRUE(deserialized2.TEST_Equals(&result)); + + // Test missing field + CompactionServiceResult deserialized3; + deserialized3.bytes_read = 0; + std::string to_remove = "bytes_read=123;"; + size_t pos = output.find(to_remove); + ASSERT_TRUE(pos != std::string::npos); + output.erase(pos, to_remove.length()); + ASSERT_OK(CompactionServiceResult::Read(output, &deserialized3)); + mismatch.clear(); + ASSERT_FALSE(deserialized3.TEST_Equals(&result, &mismatch)); + ASSERT_EQ(mismatch, "bytes_read"); + + deserialized3.bytes_read = 123; + ASSERT_TRUE(deserialized3.TEST_Equals(&result)); + + // Test invalid version + output.clear(); + ASSERT_OK(result.Write(&output)); + + uint32_t data_version = DecodeFixed32(output.data()); + const size_t kDataVersionSize = sizeof(data_version); + ASSERT_EQ(data_version, + 1U); // Update once the default data version is changed + char buf[kDataVersionSize]; + EncodeFixed32(buf, data_version + 10); // make sure it's not valid + output.replace(0, kDataVersionSize, buf, kDataVersionSize); + Status s = CompactionServiceResult::Read(output, &deserialized3); + ASSERT_TRUE(s.IsNotSupported()); + for (const auto& item : status_list) { + item.PermitUncheckedError(); + } +} + +class CompactionJobDynamicFileSizeTest + : public CompactionJobTestBase, + public ::testing::WithParamInterface<bool> { + public: + CompactionJobDynamicFileSizeTest() + : CompactionJobTestBase( + test::PerThreadDBPath("compaction_job_dynamic_file_size_test"), + BytewiseComparator(), [](uint64_t /*ts*/) { return ""; }, + /*test_io_priority=*/false, TableTypeForTest::kMockTable) {} +}; + +TEST_P(CompactionJobDynamicFileSizeTest, CutForMaxCompactionBytes) { + // dynamic_file_size option should have no impact on cutting for max + // compaction bytes. + bool enable_dyanmic_file_size = GetParam(); + cf_options_.level_compaction_dynamic_file_size = enable_dyanmic_file_size; + + NewDB(); + mutable_cf_options_.target_file_size_base = 80; + mutable_cf_options_.max_compaction_bytes = 21; + + auto file1 = mock::MakeMockFile({ + {KeyStr("c", 5U, kTypeValue), "val2"}, + {KeyStr("n", 6U, kTypeValue), "val3"}, + }); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile({{KeyStr("h", 3U, kTypeValue), "val"}, + {KeyStr("j", 4U, kTypeValue), "val"}}); + AddMockFile(file2, 1); + + // Create three L2 files, each size 10. + // max_compaction_bytes 21 means the compaction output in L1 will + // be cut to at least two files. 
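A quick arithmetic check of the cut predicted above, as a standalone hedged sketch rather than the real cutting code:

  // An output must be cut once its overlap with grandparent (L2) data exceeds
  // max_compaction_bytes.
  bool MustCutOutput(uint64_t overlapped_grandparent_bytes,
                     uint64_t max_compaction_bytes) {
    return overlapped_grandparent_bytes > max_compaction_bytes;
  }
  // With three L2 files of ~10 bytes each and max_compaction_bytes = 21:
  // MustCutOutput(2 * 10, 21) == false and MustCutOutput(3 * 10, 21) == true,
  // so the L1 output is split into at least two files.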
+ auto file3 = mock::MakeMockFile({{KeyStr("b", 1U, kTypeValue), "val"}, + {KeyStr("c", 1U, kTypeValue), "val"}, + {KeyStr("c1", 1U, kTypeValue), "val"}, + {KeyStr("c2", 1U, kTypeValue), "val"}, + {KeyStr("c3", 1U, kTypeValue), "val"}, + {KeyStr("c4", 1U, kTypeValue), "val"}, + {KeyStr("d", 1U, kTypeValue), "val"}, + {KeyStr("e", 2U, kTypeValue), "val"}}); + AddMockFile(file3, 2); + + auto file4 = mock::MakeMockFile({{KeyStr("h", 1U, kTypeValue), "val"}, + {KeyStr("i", 1U, kTypeValue), "val"}, + {KeyStr("i1", 1U, kTypeValue), "val"}, + {KeyStr("i2", 1U, kTypeValue), "val"}, + {KeyStr("i3", 1U, kTypeValue), "val"}, + {KeyStr("i4", 1U, kTypeValue), "val"}, + {KeyStr("j", 1U, kTypeValue), "val"}, + {KeyStr("k", 2U, kTypeValue), "val"}}); + AddMockFile(file4, 2); + + auto file5 = mock::MakeMockFile({{KeyStr("l", 1U, kTypeValue), "val"}, + {KeyStr("m", 1U, kTypeValue), "val"}, + {KeyStr("m1", 1U, kTypeValue), "val"}, + {KeyStr("m2", 1U, kTypeValue), "val"}, + {KeyStr("m3", 1U, kTypeValue), "val"}, + {KeyStr("m4", 1U, kTypeValue), "val"}, + {KeyStr("n", 1U, kTypeValue), "val"}, + {KeyStr("o", 2U, kTypeValue), "val"}}); + AddMockFile(file5, 2); + + // The expected output should be: + // L1: [c, h, j] [n] + // L2: [b ... e] [h ... k] [l ... o] + // It's better to have "j" in the first file, because anyway it's overlapping + // with the second file on L2. + // (Note: before this PR, it was cut at "h" because it's using the internal + // comparator which think L1 "h" with seqno 3 is smaller than L2 "h" with + // seqno 1, but actually they're overlapped with the compaction picker). + + auto expected_file1 = + mock::MakeMockFile({{KeyStr("c", 5U, kTypeValue), "val2"}, + {KeyStr("h", 3U, kTypeValue), "val"}, + {KeyStr("j", 4U, kTypeValue), "val"}}); + auto expected_file2 = + mock::MakeMockFile({{KeyStr("n", 6U, kTypeValue), "val3"}}); + + SetLastSequence(6U); + + const std::vector<int> input_levels = {0, 1}; + auto lvl0_files = cfd_->current()->storage_info()->LevelFiles(0); + auto lvl1_files = cfd_->current()->storage_info()->LevelFiles(1); + + RunCompaction({lvl0_files, lvl1_files}, input_levels, + {expected_file1, expected_file2}); +} + +TEST_P(CompactionJobDynamicFileSizeTest, CutToSkipGrandparentFile) { + bool enable_dyanmic_file_size = GetParam(); + cf_options_.level_compaction_dynamic_file_size = enable_dyanmic_file_size; + + NewDB(); + // Make sure the grandparent level file size (10) qualifies skipping. + // Currently, it has to be > 1/8 of target file size. 
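Worked numbers for the heuristic mentioned above, as a standalone hedged sketch (the exact constant lives in the compaction output code and may change):

  // A grandparent file can trigger an early cut to skip over it only if it is
  // larger than 1/8 of the target output file size.
  bool QualifiesForSkip(uint64_t grandparent_file_size,
                        uint64_t target_file_size_base) {
    return grandparent_file_size > target_file_size_base / 8;
  }
  // Here: 10 > 70 / 8, so the 10-byte L2 files qualify for skipping.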
+ mutable_cf_options_.target_file_size_base = 70; + + auto file1 = mock::MakeMockFile({ + {KeyStr("a", 5U, kTypeValue), "val2"}, + {KeyStr("z", 6U, kTypeValue), "val3"}, + }); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile({{KeyStr("c", 3U, kTypeValue), "val"}, + {KeyStr("x", 4U, kTypeValue), "val"}}); + AddMockFile(file2, 1); + + auto file3 = mock::MakeMockFile({{KeyStr("b", 1U, kTypeValue), "val"}, + {KeyStr("d", 2U, kTypeValue), "val"}}); + AddMockFile(file3, 2); + + auto file4 = mock::MakeMockFile({{KeyStr("h", 1U, kTypeValue), "val"}, + {KeyStr("i", 2U, kTypeValue), "val"}}); + AddMockFile(file4, 2); + + auto file5 = mock::MakeMockFile({{KeyStr("v", 1U, kTypeValue), "val"}, + {KeyStr("y", 2U, kTypeValue), "val"}}); + AddMockFile(file5, 2); + + auto expected_file1 = + mock::MakeMockFile({{KeyStr("a", 5U, kTypeValue), "val2"}, + {KeyStr("c", 3U, kTypeValue), "val"}}); + auto expected_file2 = + mock::MakeMockFile({{KeyStr("x", 4U, kTypeValue), "val"}, + {KeyStr("z", 6U, kTypeValue), "val3"}}); + + auto expected_file_disable_dynamic_file_size = + mock::MakeMockFile({{KeyStr("a", 5U, kTypeValue), "val2"}, + {KeyStr("c", 3U, kTypeValue), "val"}, + {KeyStr("x", 4U, kTypeValue), "val"}, + {KeyStr("z", 6U, kTypeValue), "val3"}}); + + SetLastSequence(6U); + const std::vector<int> input_levels = {0, 1}; + auto lvl0_files = cfd_->current()->storage_info()->LevelFiles(0); + auto lvl1_files = cfd_->current()->storage_info()->LevelFiles(1); + if (enable_dyanmic_file_size) { + RunCompaction({lvl0_files, lvl1_files}, input_levels, + {expected_file1, expected_file2}); + } else { + RunCompaction({lvl0_files, lvl1_files}, input_levels, + {expected_file_disable_dynamic_file_size}); + } +} + +TEST_P(CompactionJobDynamicFileSizeTest, CutToAlignGrandparentBoundary) { + bool enable_dyanmic_file_size = GetParam(); + cf_options_.level_compaction_dynamic_file_size = enable_dyanmic_file_size; + NewDB(); + + // MockTable has 1 byte per entry by default and each file is 10 bytes. + // When the file size is smaller than 100, it won't cut file earlier to align + // with its grandparent boundary. 
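To make the numbers in the comment above concrete (a hedged reading; the exact threshold is internal to the output-cutting code), the override that follows inflates every mock entry so that outputs are no longer tiny:

  // Sizes this test sets up, checked at compile time.
  constexpr uint64_t kDefaultEntryBytes = 1;        // MockTable default
  constexpr uint64_t kEntriesPerFile = 10;          // default files are ~10 bytes
  constexpr uint64_t kKeyValueSizeOverride = 10000; // value used below
  static_assert(kDefaultEntryBytes * kEntriesPerFile < 100,
                "default-sized outputs stay below the no-early-cut threshold");
  static_assert(kKeyValueSizeOverride * kEntriesPerFile > 100,
                "inflated entries push outputs well past it");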
+ const size_t kKeyValueSize = 10000; + mock_table_factory_->SetKeyValueSize(kKeyValueSize); + + mutable_cf_options_.target_file_size_base = 10 * kKeyValueSize; + + mock::KVVector file1; + char ch = 'd'; + // Add value from d -> o + for (char i = 0; i < 12; i++) { + file1.emplace_back(KeyStr(std::string(1, ch + i), i + 10, kTypeValue), + "val" + std::to_string(i)); + } + + AddMockFile(file1); + + auto file2 = mock::MakeMockFile({{KeyStr("e", 3U, kTypeValue), "val"}, + {KeyStr("s", 4U, kTypeValue), "val"}}); + AddMockFile(file2, 1); + + // the 1st grandparent file should be skipped + auto file3 = mock::MakeMockFile({{KeyStr("a", 1U, kTypeValue), "val"}, + {KeyStr("b", 2U, kTypeValue), "val"}}); + AddMockFile(file3, 2); + + auto file4 = mock::MakeMockFile({{KeyStr("c", 1U, kTypeValue), "val"}, + {KeyStr("e", 2U, kTypeValue), "val"}}); + AddMockFile(file4, 2); + + auto file5 = mock::MakeMockFile({{KeyStr("h", 1U, kTypeValue), "val"}, + {KeyStr("j", 2U, kTypeValue), "val"}}); + AddMockFile(file5, 2); + + auto file6 = mock::MakeMockFile({{KeyStr("k", 1U, kTypeValue), "val"}, + {KeyStr("n", 2U, kTypeValue), "val"}}); + AddMockFile(file6, 2); + + auto file7 = mock::MakeMockFile({{KeyStr("q", 1U, kTypeValue), "val"}, + {KeyStr("t", 2U, kTypeValue), "val"}}); + AddMockFile(file7, 2); + + // The expected outputs are: + // L1: [d,e,f,g,h,i,j] [k,l,m,n,o,s] + // L2: [a, b] [c, e] [h, j] [k, n] [q, t] + // The first output cut earlier at "j", so it could be aligned with L2 files. + // If dynamic_file_size is not enabled, it will be cut based on the + // target_file_size + mock::KVVector expected_file1; + for (char i = 0; i < 7; i++) { + expected_file1.emplace_back( + KeyStr(std::string(1, ch + i), i + 10, kTypeValue), + "val" + std::to_string(i)); + } + + mock::KVVector expected_file2; + for (char i = 7; i < 12; i++) { + expected_file2.emplace_back( + KeyStr(std::string(1, ch + i), i + 10, kTypeValue), + "val" + std::to_string(i)); + } + expected_file2.emplace_back(KeyStr("s", 4U, kTypeValue), "val"); + + mock::KVVector expected_file_disable_dynamic_file_size1; + for (char i = 0; i < 10; i++) { + expected_file_disable_dynamic_file_size1.emplace_back( + KeyStr(std::string(1, ch + i), i + 10, kTypeValue), + "val" + std::to_string(i)); + } + + mock::KVVector expected_file_disable_dynamic_file_size2; + for (char i = 10; i < 12; i++) { + expected_file_disable_dynamic_file_size2.emplace_back( + KeyStr(std::string(1, ch + i), i + 10, kTypeValue), + "val" + std::to_string(i)); + } + + expected_file_disable_dynamic_file_size2.emplace_back( + KeyStr("s", 4U, kTypeValue), "val"); + + SetLastSequence(22U); + const std::vector<int> input_levels = {0, 1}; + auto lvl0_files = cfd_->current()->storage_info()->LevelFiles(0); + auto lvl1_files = cfd_->current()->storage_info()->LevelFiles(1); + if (enable_dyanmic_file_size) { + RunCompaction({lvl0_files, lvl1_files}, input_levels, + {expected_file1, expected_file2}); + } else { + RunCompaction({lvl0_files, lvl1_files}, input_levels, + {expected_file_disable_dynamic_file_size1, + expected_file_disable_dynamic_file_size2}); + } +} + +TEST_P(CompactionJobDynamicFileSizeTest, CutToAlignGrandparentBoundarySameKey) { + bool enable_dyanmic_file_size = GetParam(); + cf_options_.level_compaction_dynamic_file_size = enable_dyanmic_file_size; + NewDB(); + + // MockTable has 1 byte per entry by default and each file is 10 bytes. + // When the file size is smaller than 100, it won't cut file earlier to align + // with its grandparent boundary. 
+ const size_t kKeyValueSize = 10000; + mock_table_factory_->SetKeyValueSize(kKeyValueSize); + + mutable_cf_options_.target_file_size_base = 10 * kKeyValueSize; + + mock::KVVector file1; + for (int i = 0; i < 7; i++) { + file1.emplace_back(KeyStr("a", 100 - i, kTypeValue), + "val" + std::to_string(100 - i)); + } + file1.emplace_back(KeyStr("b", 90, kTypeValue), "valb"); + + AddMockFile(file1); + + auto file2 = mock::MakeMockFile({{KeyStr("a", 93U, kTypeValue), "val93"}, + {KeyStr("b", 90U, kTypeValue), "valb"}}); + AddMockFile(file2, 1); + + auto file3 = mock::MakeMockFile({{KeyStr("a", 89U, kTypeValue), "val"}, + {KeyStr("a", 88U, kTypeValue), "val"}}); + AddMockFile(file3, 2); + + auto file4 = mock::MakeMockFile({{KeyStr("a", 87U, kTypeValue), "val"}, + {KeyStr("a", 86U, kTypeValue), "val"}}); + AddMockFile(file4, 2); + + auto file5 = mock::MakeMockFile({{KeyStr("b", 85U, kTypeValue), "val"}, + {KeyStr("b", 84U, kTypeValue), "val"}}); + AddMockFile(file5, 2); + + mock::KVVector expected_file1; + mock::KVVector expected_file_disable_dynamic_file_size; + + for (int i = 0; i < 8; i++) { + expected_file1.emplace_back(KeyStr("a", 100 - i, kTypeValue), + "val" + std::to_string(100 - i)); + expected_file_disable_dynamic_file_size.emplace_back( + KeyStr("a", 100 - i, kTypeValue), "val" + std::to_string(100 - i)); + } + + // make sure `b` is cut in a separated file (so internally it's not using + // internal comparator, which will think the "b:90" (seqno 90) here is smaller + // than "b:85" on L2.) + auto expected_file2 = + mock::MakeMockFile({{KeyStr("b", 90U, kTypeValue), "valb"}}); + + expected_file_disable_dynamic_file_size.emplace_back( + KeyStr("b", 90U, kTypeValue), "valb"); + + SetLastSequence(122U); + const std::vector<int> input_levels = {0, 1}; + auto lvl0_files = cfd_->current()->storage_info()->LevelFiles(0); + auto lvl1_files = cfd_->current()->storage_info()->LevelFiles(1); + + // Just keep all the history + std::vector<SequenceNumber> snapshots; + for (int i = 80; i <= 100; i++) { + snapshots.emplace_back(i); + } + if (enable_dyanmic_file_size) { + RunCompaction({lvl0_files, lvl1_files}, input_levels, + {expected_file1, expected_file2}, snapshots); + } else { + RunCompaction({lvl0_files, lvl1_files}, input_levels, + {expected_file_disable_dynamic_file_size}, snapshots); + } +} + +TEST_P(CompactionJobDynamicFileSizeTest, CutForMaxCompactionBytesSameKey) { + // dynamic_file_size option should have no impact on cutting for max + // compaction bytes. 
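Before the parameterized setup below, a hedged note on the same-user-key rule this case documents (see also the in-test comment about the two "a" entries): an output file is not cut between two internal keys that share a user key, because splitting them would not reduce the overlapped grandparent data anyway. A standalone sketch of that rule:

  // Sketch: cutting is only permitted where adjacent user keys differ.
  bool CanCutBetween(const Slice& prev_user_key, const Slice& next_user_key,
                     const Comparator* ucmp) {
    return ucmp->Compare(prev_user_key, next_user_key) != 0;
  }
  // Hence both versions of "a" (seqnos 104 and 102) stay in one output file,
  // while "b" and "c" are cut into their own files.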
+ bool enable_dyanmic_file_size = GetParam(); + cf_options_.level_compaction_dynamic_file_size = enable_dyanmic_file_size; + + NewDB(); + mutable_cf_options_.target_file_size_base = 80; + mutable_cf_options_.max_compaction_bytes = 20; + + auto file1 = mock::MakeMockFile({{KeyStr("a", 104U, kTypeValue), "val1"}, + {KeyStr("b", 103U, kTypeValue), "val"}}); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile({{KeyStr("a", 102U, kTypeValue), "val2"}, + {KeyStr("c", 101U, kTypeValue), "val"}}); + AddMockFile(file2, 1); + + for (int i = 0; i < 10; i++) { + auto file = + mock::MakeMockFile({{KeyStr("a", 100 - (i * 2), kTypeValue), "val"}, + {KeyStr("a", 99 - (i * 2), kTypeValue), "val"}}); + AddMockFile(file, 2); + } + + for (int i = 0; i < 10; i++) { + auto file = + mock::MakeMockFile({{KeyStr("b", 80 - (i * 2), kTypeValue), "val"}, + {KeyStr("b", 79 - (i * 2), kTypeValue), "val"}}); + AddMockFile(file, 2); + } + + auto file5 = mock::MakeMockFile({{KeyStr("c", 60U, kTypeValue), "valc"}, + {KeyStr("c", 59U, kTypeValue), "valc"}}); + + // "a" has 10 overlapped grandparent files (each size 10), which is far + // exceeded the `max_compaction_bytes`, but make sure 2 "a" are not separated, + // as splitting them won't help reducing the compaction size. + // also make sure "b" and "c" are cut separately. + mock::KVVector expected_file1 = + mock::MakeMockFile({{KeyStr("a", 104U, kTypeValue), "val1"}, + {KeyStr("a", 102U, kTypeValue), "val2"}}); + mock::KVVector expected_file2 = + mock::MakeMockFile({{KeyStr("b", 103U, kTypeValue), "val"}}); + mock::KVVector expected_file3 = + mock::MakeMockFile({{KeyStr("c", 101U, kTypeValue), "val"}}); + + SetLastSequence(122U); + const std::vector<int> input_levels = {0, 1}; + auto lvl0_files = cfd_->current()->storage_info()->LevelFiles(0); + auto lvl1_files = cfd_->current()->storage_info()->LevelFiles(1); + + // Just keep all the history + std::vector<SequenceNumber> snapshots; + for (int i = 80; i <= 105; i++) { + snapshots.emplace_back(i); + } + RunCompaction({lvl0_files, lvl1_files}, input_levels, + {expected_file1, expected_file2, expected_file3}, snapshots); +} + +INSTANTIATE_TEST_CASE_P(CompactionJobDynamicFileSizeTest, + CompactionJobDynamicFileSizeTest, testing::Bool()); + +class CompactionJobTimestampTest : public CompactionJobTestBase { + public: + CompactionJobTimestampTest() + : CompactionJobTestBase(test::PerThreadDBPath("compaction_job_ts_test"), + test::BytewiseComparatorWithU64TsWrapper(), + test::EncodeInt, /*test_io_priority=*/false, + TableTypeForTest::kMockTable) {} +}; + +TEST_F(CompactionJobTimestampTest, GCDisabled) { + NewDB(); + + auto file1 = + mock::MakeMockFile({{KeyStr("a", 10, ValueType::kTypeValue, 100), "a10"}, + {KeyStr("a", 9, ValueType::kTypeValue, 99), "a9"}, + {KeyStr("b", 8, ValueType::kTypeValue, 98), "b8"}, + {KeyStr("d", 7, ValueType::kTypeValue, 97), "d7"}}); + + AddMockFile(file1); + + auto file2 = mock::MakeMockFile( + {{KeyStr("b", 6, ValueType::kTypeDeletionWithTimestamp, 96), ""}, + {KeyStr("c", 5, ValueType::kTypeDeletionWithTimestamp, 95), ""}, + {KeyStr("c", 4, ValueType::kTypeValue, 94), "c5"}, + {KeyStr("d", 3, ValueType::kTypeSingleDeletion, 93), ""}}); + AddMockFile(file2); + + SetLastSequence(10); + + auto expected_results = mock::MakeMockFile( + {{KeyStr("a", 10, ValueType::kTypeValue, 100), "a10"}, + {KeyStr("a", 9, ValueType::kTypeValue, 99), "a9"}, + {KeyStr("b", 8, ValueType::kTypeValue, 98), "b8"}, + {KeyStr("b", 6, ValueType::kTypeDeletionWithTimestamp, 96), ""}, + {KeyStr("c", 5, 
ValueType::kTypeDeletionWithTimestamp, 95), ""}, + {KeyStr("c", 4, ValueType::kTypeValue, 94), "c5"}, + {KeyStr("d", 7, ValueType::kTypeValue, 97), "d7"}, + {KeyStr("d", 3, ValueType::kTypeSingleDeletion, 93), ""}}); + constexpr int input_level = 0; + const auto& files = cfd_->current()->storage_info()->LevelFiles(input_level); + RunCompaction({files}, {input_level}, {expected_results}); +} + +TEST_F(CompactionJobTimestampTest, NoKeyExpired) { + NewDB(); + + auto file1 = + mock::MakeMockFile({{KeyStr("a", 6, ValueType::kTypeValue, 100), "a6"}, + {KeyStr("b", 7, ValueType::kTypeValue, 101), "b7"}, + {KeyStr("c", 5, ValueType::kTypeValue, 99), "c5"}}); + AddMockFile(file1); + + auto file2 = + mock::MakeMockFile({{KeyStr("a", 4, ValueType::kTypeValue, 98), "a4"}, + {KeyStr("c", 3, ValueType::kTypeValue, 97), "c3"}}); + AddMockFile(file2); + + SetLastSequence(101); + + auto expected_results = + mock::MakeMockFile({{KeyStr("a", 6, ValueType::kTypeValue, 100), "a6"}, + {KeyStr("a", 4, ValueType::kTypeValue, 98), "a4"}, + {KeyStr("b", 7, ValueType::kTypeValue, 101), "b7"}, + {KeyStr("c", 5, ValueType::kTypeValue, 99), "c5"}, + {KeyStr("c", 3, ValueType::kTypeValue, 97), "c3"}}); + constexpr int input_level = 0; + const auto& files = cfd_->current()->storage_info()->LevelFiles(input_level); + + full_history_ts_low_ = encode_u64_ts_(0); + RunCompaction({files}, {input_level}, {expected_results}); +} + +TEST_F(CompactionJobTimestampTest, AllKeysExpired) { + NewDB(); + + auto file1 = mock::MakeMockFile( + {{KeyStr("a", 5, ValueType::kTypeDeletionWithTimestamp, 100), ""}, + {KeyStr("b", 6, ValueType::kTypeSingleDeletion, 99), ""}, + {KeyStr("c", 7, ValueType::kTypeValue, 98), "c7"}}); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile( + {{KeyStr("a", 4, ValueType::kTypeValue, 97), "a4"}, + {KeyStr("b", 3, ValueType::kTypeValue, 96), "b3"}, + {KeyStr("c", 2, ValueType::kTypeDeletionWithTimestamp, 95), ""}, + {KeyStr("c", 1, ValueType::kTypeValue, 94), "c1"}}); + AddMockFile(file2); + + SetLastSequence(7); + + auto expected_results = + mock::MakeMockFile({{KeyStr("c", 0, ValueType::kTypeValue, 0), "c7"}}); + constexpr int input_level = 0; + const auto& files = cfd_->current()->storage_info()->LevelFiles(input_level); + + full_history_ts_low_ = encode_u64_ts_(std::numeric_limits<uint64_t>::max()); + RunCompaction({files}, {input_level}, {expected_results}); +} + +TEST_F(CompactionJobTimestampTest, SomeKeysExpired) { + NewDB(); + + auto file1 = + mock::MakeMockFile({{KeyStr("a", 5, ValueType::kTypeValue, 50), "a5"}, + {KeyStr("b", 6, ValueType::kTypeValue, 49), "b6"}}); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile( + {{KeyStr("a", 3, ValueType::kTypeValue, 48), "a3"}, + {KeyStr("a", 2, ValueType::kTypeValue, 46), "a2"}, + {KeyStr("b", 4, ValueType::kTypeDeletionWithTimestamp, 47), ""}}); + AddMockFile(file2); + + SetLastSequence(6); + + auto expected_results = + mock::MakeMockFile({{KeyStr("a", 5, ValueType::kTypeValue, 50), "a5"}, + {KeyStr("a", 0, ValueType::kTypeValue, 0), "a3"}, + {KeyStr("b", 6, ValueType::kTypeValue, 49), "b6"}}); + constexpr int input_level = 0; + const auto& files = cfd_->current()->storage_info()->LevelFiles(input_level); + + full_history_ts_low_ = encode_u64_ts_(49); + RunCompaction({files}, {input_level}, {expected_results}); +} + +class CompactionJobTimestampTestWithBbTable : public CompactionJobTestBase { + public: + // Block-based table is needed if we want to test subcompaction partitioning + // with anchors. 
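A hedged aside on the anchors mentioned above: block-based tables can report approximate key anchors (sampled user keys plus an estimated preceding data size) that the compaction job can use to pick subcompaction split points, while the mock table used by the other fixtures does not, hence the requirement here. The standalone sketch below assumes the TableReader::ApproximateKeyAnchors API from table/table_reader.h:

  // Sketch: collect sampled split-point candidates from one table file.
  Status CollectAnchors(TableReader* reader,
                        std::vector<TableReader::Anchor>* anchors) {
    return reader->ApproximateKeyAnchors(ReadOptions(), *anchors);
  }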
+ explicit CompactionJobTimestampTestWithBbTable() + : CompactionJobTestBase( + test::PerThreadDBPath("compaction_job_ts_bbt_test"), + test::BytewiseComparatorWithU64TsWrapper(), test::EncodeInt, + /*test_io_priority=*/false, TableTypeForTest::kBlockBasedTable) {} +}; + +TEST_F(CompactionJobTimestampTestWithBbTable, SubcompactionAnchorL1) { + cf_options_.target_file_size_base = 20; + mutable_cf_options_.target_file_size_base = 20; + NewDB(); + + const std::vector<std::string> keys = { + KeyStr("a", 20, ValueType::kTypeValue, 200), + KeyStr("b", 21, ValueType::kTypeValue, 210), + KeyStr("b", 20, ValueType::kTypeValue, 200), + KeyStr("b", 18, ValueType::kTypeValue, 180), + KeyStr("c", 17, ValueType::kTypeValue, 170), + KeyStr("c", 16, ValueType::kTypeValue, 160), + KeyStr("c", 15, ValueType::kTypeValue, 150)}; + const std::vector<std::string> values = {"a20", "b21", "b20", "b18", + "c17", "c16", "c15"}; + + constexpr int input_level = 1; + + auto file1 = mock::MakeMockFile( + {{keys[0], values[0]}, {keys[1], values[1]}, {keys[2], values[2]}}); + AddMockFile(file1, input_level); + + auto file2 = mock::MakeMockFile( + {{keys[3], values[3]}, {keys[4], values[4]}, {keys[5], values[5]}}); + AddMockFile(file2, input_level); + + auto file3 = mock::MakeMockFile({{keys[6], values[6]}}); + AddMockFile(file3, input_level); + + SetLastSequence(20); + + auto output1 = mock::MakeMockFile({{keys[0], values[0]}}); + auto output2 = mock::MakeMockFile( + {{keys[1], values[1]}, {keys[2], values[2]}, {keys[3], values[3]}}); + auto output3 = mock::MakeMockFile( + {{keys[4], values[4]}, {keys[5], values[5]}, {keys[6], values[6]}}); + + auto expected_results = + std::vector<mock::KVVector>{output1, output2, output3}; + const auto& files = cfd_->current()->storage_info()->LevelFiles(input_level); + + constexpr int output_level = 2; + constexpr int max_subcompactions = 4; + RunCompaction({files}, {input_level}, expected_results, /*snapshots=*/{}, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + output_level, /*verify=*/true, {kInvalidBlobFileNumber}, + /*check_get_priority=*/false, Env::IO_TOTAL, Env::IO_TOTAL, + max_subcompactions); +} + +TEST_F(CompactionJobTimestampTestWithBbTable, SubcompactionL0) { + cf_options_.target_file_size_base = 20; + mutable_cf_options_.target_file_size_base = 20; + NewDB(); + + const std::vector<std::string> keys = { + KeyStr("a", 20, ValueType::kTypeValue, 200), + KeyStr("b", 20, ValueType::kTypeValue, 200), + KeyStr("b", 19, ValueType::kTypeValue, 190), + KeyStr("b", 18, ValueType::kTypeValue, 180), + KeyStr("c", 17, ValueType::kTypeValue, 170), + KeyStr("c", 16, ValueType::kTypeValue, 160), + KeyStr("c", 15, ValueType::kTypeValue, 150)}; + const std::vector<std::string> values = {"a20", "b20", "b19", "b18", + "c17", "c16", "c15"}; + + constexpr int input_level = 0; + + auto file1 = mock::MakeMockFile({{keys[5], values[5]}, {keys[6], values[6]}}); + AddMockFile(file1, input_level); + + auto file2 = mock::MakeMockFile({{keys[3], values[3]}, {keys[4], values[4]}}); + AddMockFile(file2, input_level); + + auto file3 = mock::MakeMockFile( + {{keys[0], values[0]}, {keys[1], values[1]}, {keys[2], values[2]}}); + AddMockFile(file3, input_level); + + SetLastSequence(20); + + auto output1 = mock::MakeMockFile({{keys[0], values[0]}}); + auto output2 = mock::MakeMockFile( + {{keys[1], values[1]}, {keys[2], values[2]}, {keys[3], values[3]}}); + auto output3 = mock::MakeMockFile( + {{keys[4], values[4]}, {keys[5], values[5]}, {keys[6], values[6]}}); + + auto expected_results = + 
std::vector<mock::KVVector>{output1, output2, output3}; + const auto& files = cfd_->current()->storage_info()->LevelFiles(input_level); + + constexpr int output_level = 1; + constexpr int max_subcompactions = 4; + RunCompaction({files}, {input_level}, expected_results, /*snapshots=*/{}, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + output_level, /*verify=*/true, {kInvalidBlobFileNumber}, + /*check_get_priority=*/false, Env::IO_TOTAL, Env::IO_TOTAL, + max_subcompactions); +} + +// The io priority of the compaction reads and writes are different from +// other DB reads and writes. To prepare the compaction input files, use the +// default filesystem from Env. To test the io priority of the compaction +// reads and writes, db_options_.fs is set as MockTestFileSystem. +class CompactionJobIOPriorityTest : public CompactionJobTestBase { + public: + CompactionJobIOPriorityTest() + : CompactionJobTestBase( + test::PerThreadDBPath("compaction_job_io_priority_test"), + BytewiseComparator(), [](uint64_t /*ts*/) { return ""; }, + /*test_io_priority=*/true, TableTypeForTest::kBlockBasedTable) {} +}; + +TEST_F(CompactionJobIOPriorityTest, WriteControllerStateNormal) { + // When the state from WriteController is normal. + NewDB(); + mock::KVVector expected_results = CreateTwoFiles(false); + auto cfd = versions_->GetColumnFamilySet()->GetDefault(); + constexpr int input_level = 0; + auto files = cfd->current()->storage_info()->LevelFiles(input_level); + ASSERT_EQ(2U, files.size()); + RunCompaction({files}, {input_level}, {expected_results}, {}, + kMaxSequenceNumber, 1, false, {kInvalidBlobFileNumber}, false, + Env::IO_LOW, Env::IO_LOW); +} + +TEST_F(CompactionJobIOPriorityTest, WriteControllerStateDelayed) { + // When the state from WriteController is Delayed. + NewDB(); + mock::KVVector expected_results = CreateTwoFiles(false); + auto cfd = versions_->GetColumnFamilySet()->GetDefault(); + constexpr int input_level = 0; + auto files = cfd->current()->storage_info()->LevelFiles(input_level); + ASSERT_EQ(2U, files.size()); + { + std::unique_ptr<WriteControllerToken> delay_token = + write_controller_.GetDelayToken(1000000); + RunCompaction({files}, {input_level}, {expected_results}, {}, + kMaxSequenceNumber, 1, false, {kInvalidBlobFileNumber}, false, + Env::IO_USER, Env::IO_USER); + } +} + +TEST_F(CompactionJobIOPriorityTest, WriteControllerStateStalled) { + // When the state from WriteController is Stalled. 
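// As exercised by these WriteController tests, the expected rate limiter
// priority for compaction reads and writes is Env::IO_LOW while the write
// controller is in its normal state, and Env::IO_USER once a delay token or
// a stop token is held.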
+ NewDB(); + mock::KVVector expected_results = CreateTwoFiles(false); + auto cfd = versions_->GetColumnFamilySet()->GetDefault(); + constexpr int input_level = 0; + auto files = cfd->current()->storage_info()->LevelFiles(input_level); + ASSERT_EQ(2U, files.size()); + { + std::unique_ptr<WriteControllerToken> stop_token = + write_controller_.GetStopToken(); + RunCompaction({files}, {input_level}, {expected_results}, {}, + kMaxSequenceNumber, 1, false, {kInvalidBlobFileNumber}, false, + Env::IO_USER, Env::IO_USER); + } +} + +TEST_F(CompactionJobIOPriorityTest, GetRateLimiterPriority) { + NewDB(); + mock::KVVector expected_results = CreateTwoFiles(false); + auto cfd = versions_->GetColumnFamilySet()->GetDefault(); + constexpr int input_level = 0; + auto files = cfd->current()->storage_info()->LevelFiles(input_level); + ASSERT_EQ(2U, files.size()); + RunCompaction({files}, {input_level}, {expected_results}, {}, + kMaxSequenceNumber, 1, false, {kInvalidBlobFileNumber}, true, + Env::IO_LOW, Env::IO_LOW); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); + return RUN_ALL_TESTS(); +} + +#else +#include <stdio.h> + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, + "SKIPPED as CompactionJobStats is not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/db/compaction/compaction_outputs.cc b/src/rocksdb/db/compaction/compaction_outputs.cc new file mode 100644 index 000000000..e74378e2a --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_outputs.cc @@ -0,0 +1,646 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
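// In broad terms, CompactionOutputs (declared in compaction_outputs.h, which
// follows later in this diff) collects the SST files produced by a
// subcompaction: it owns the current table builder and file writer, decides
// in ShouldStopBefore() when to cut a new output file, and attaches range
// tombstones to each file in AddRangeDels().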
+ +#include "db/compaction/compaction_outputs.h" + +#include "db/builder.h" + +namespace ROCKSDB_NAMESPACE { + +void CompactionOutputs::NewBuilder(const TableBuilderOptions& tboptions) { + builder_.reset(NewTableBuilder(tboptions, file_writer_.get())); +} + +Status CompactionOutputs::Finish(const Status& intput_status, + const SeqnoToTimeMapping& seqno_time_mapping) { + FileMetaData* meta = GetMetaData(); + assert(meta != nullptr); + Status s = intput_status; + if (s.ok()) { + std::string seqno_time_mapping_str; + seqno_time_mapping.Encode(seqno_time_mapping_str, meta->fd.smallest_seqno, + meta->fd.largest_seqno, meta->file_creation_time); + builder_->SetSeqnoTimeTableProperties(seqno_time_mapping_str, + meta->oldest_ancester_time); + s = builder_->Finish(); + + } else { + builder_->Abandon(); + } + Status io_s = builder_->io_status(); + if (s.ok()) { + s = io_s; + } else { + io_s.PermitUncheckedError(); + } + const uint64_t current_bytes = builder_->FileSize(); + if (s.ok()) { + meta->fd.file_size = current_bytes; + meta->marked_for_compaction = builder_->NeedCompact(); + } + current_output().finished = true; + stats_.bytes_written += current_bytes; + stats_.num_output_files = outputs_.size(); + + return s; +} + +IOStatus CompactionOutputs::WriterSyncClose(const Status& input_status, + SystemClock* clock, + Statistics* statistics, + bool use_fsync) { + IOStatus io_s; + if (input_status.ok()) { + StopWatch sw(clock, statistics, COMPACTION_OUTFILE_SYNC_MICROS); + io_s = file_writer_->Sync(use_fsync); + } + if (input_status.ok() && io_s.ok()) { + io_s = file_writer_->Close(); + } + + if (input_status.ok() && io_s.ok()) { + FileMetaData* meta = GetMetaData(); + meta->file_checksum = file_writer_->GetFileChecksum(); + meta->file_checksum_func_name = file_writer_->GetFileChecksumFuncName(); + } + + file_writer_.reset(); + + return io_s; +} + +size_t CompactionOutputs::UpdateGrandparentBoundaryInfo( + const Slice& internal_key) { + size_t curr_key_boundary_switched_num = 0; + const std::vector<FileMetaData*>& grandparents = compaction_->grandparents(); + + if (grandparents.empty()) { + return curr_key_boundary_switched_num; + } + assert(!internal_key.empty()); + InternalKey ikey; + ikey.DecodeFrom(internal_key); + assert(ikey.Valid()); + + const Comparator* ucmp = compaction_->column_family_data()->user_comparator(); + + // Move the grandparent_index_ to the file containing the current user_key. + // If there are multiple files containing the same user_key, make sure the + // index points to the last file containing the key. + while (grandparent_index_ < grandparents.size()) { + if (being_grandparent_gap_) { + if (sstableKeyCompare(ucmp, ikey, + grandparents[grandparent_index_]->smallest) < 0) { + break; + } + if (seen_key_) { + curr_key_boundary_switched_num++; + grandparent_overlapped_bytes_ += + grandparents[grandparent_index_]->fd.GetFileSize(); + grandparent_boundary_switched_num_++; + } + being_grandparent_gap_ = false; + } else { + int cmp_result = sstableKeyCompare( + ucmp, ikey, grandparents[grandparent_index_]->largest); + // If it's same key, make sure grandparent_index_ is pointing to the last + // one. 
+ if (cmp_result < 0 || + (cmp_result == 0 && + (grandparent_index_ == grandparents.size() - 1 || + sstableKeyCompare(ucmp, ikey, + grandparents[grandparent_index_ + 1]->smallest) < + 0))) { + break; + } + if (seen_key_) { + curr_key_boundary_switched_num++; + grandparent_boundary_switched_num_++; + } + being_grandparent_gap_ = true; + grandparent_index_++; + } + } + + // If the first key is in the middle of a grandparent file, adding it to the + // overlap + if (!seen_key_ && !being_grandparent_gap_) { + assert(grandparent_overlapped_bytes_ == 0); + grandparent_overlapped_bytes_ = + GetCurrentKeyGrandparentOverlappedBytes(internal_key); + } + + seen_key_ = true; + return curr_key_boundary_switched_num; +} + +uint64_t CompactionOutputs::GetCurrentKeyGrandparentOverlappedBytes( + const Slice& internal_key) const { + // no overlap with any grandparent file + if (being_grandparent_gap_) { + return 0; + } + uint64_t overlapped_bytes = 0; + + const std::vector<FileMetaData*>& grandparents = compaction_->grandparents(); + const Comparator* ucmp = compaction_->column_family_data()->user_comparator(); + InternalKey ikey; + ikey.DecodeFrom(internal_key); +#ifndef NDEBUG + // make sure the grandparent_index_ is pointing to the last files containing + // the current key. + int cmp_result = + sstableKeyCompare(ucmp, ikey, grandparents[grandparent_index_]->largest); + assert( + cmp_result < 0 || + (cmp_result == 0 && + (grandparent_index_ == grandparents.size() - 1 || + sstableKeyCompare( + ucmp, ikey, grandparents[grandparent_index_ + 1]->smallest) < 0))); + assert(sstableKeyCompare(ucmp, ikey, + grandparents[grandparent_index_]->smallest) >= 0); +#endif + overlapped_bytes += grandparents[grandparent_index_]->fd.GetFileSize(); + + // go backwards to find all overlapped files, one key can overlap multiple + // files. In the following example, if the current output key is `c`, and one + // compaction file was cut before `c`, current `c` can overlap with 3 files: + // [a b] [c... + // [b, b] [c, c] [c, c] [c, d] + for (int64_t i = static_cast<int64_t>(grandparent_index_) - 1; + i >= 0 && sstableKeyCompare(ucmp, ikey, grandparents[i]->largest) == 0; + i--) { + overlapped_bytes += grandparents[i]->fd.GetFileSize(); + } + + return overlapped_bytes; +} + +bool CompactionOutputs::ShouldStopBefore(const CompactionIterator& c_iter) { + assert(c_iter.Valid()); + + // always update grandparent information like overlapped file number, size + // etc. 
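// The checks below run roughly in this order: a user-defined SST partitioner
// (if one is configured), the rule that level-0 outputs are never split, the
// hard max_output_file_size limit, the round-robin cursor
// (local_output_split_key_), the grandparent-boundary heuristics, and finally
// the TTL-based file boundaries in files_to_cut_for_ttl_.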
+ const Slice& internal_key = c_iter.key(); + const uint64_t previous_overlapped_bytes = grandparent_overlapped_bytes_; + size_t num_grandparent_boundaries_crossed = + UpdateGrandparentBoundaryInfo(internal_key); + + if (!HasBuilder()) { + return false; + } + + // If there's user defined partitioner, check that first + if (partitioner_ && partitioner_->ShouldPartition(PartitionerRequest( + last_key_for_partitioner_, c_iter.user_key(), + current_output_file_size_)) == kRequired) { + return true; + } + + // files output to Level 0 won't be split + if (compaction_->output_level() == 0) { + return false; + } + + // reach the max file size + if (current_output_file_size_ >= compaction_->max_output_file_size()) { + return true; + } + + const InternalKeyComparator* icmp = + &compaction_->column_family_data()->internal_comparator(); + + // Check if it needs to split for RoundRobin + // Invalid local_output_split_key indicates that we do not need to split + if (local_output_split_key_ != nullptr && !is_split_) { + // Split occurs when the next key is larger than/equal to the cursor + if (icmp->Compare(internal_key, local_output_split_key_->Encode()) >= 0) { + is_split_ = true; + return true; + } + } + + // only check if the current key is going to cross the grandparents file + // boundary (either the file beginning or ending). + if (num_grandparent_boundaries_crossed > 0) { + // Cut the file before the current key if the size of the current output + // file + its overlapped grandparent files is bigger than + // max_compaction_bytes. Which is to prevent future bigger than + // max_compaction_bytes compaction from the current output level. + if (grandparent_overlapped_bytes_ + current_output_file_size_ > + compaction_->max_compaction_bytes()) { + return true; + } + + // Cut the file if including the key is going to add a skippable file on + // the grandparent level AND its size is reasonably big (1/8 of target file + // size). For example, if it's compacting the files L0 + L1: + // L0: [1, 21] + // L1: [3, 23] + // L2: [2, 4] [11, 15] [22, 24] + // Without this break, it will output as: + // L1: [1,3, 21,23] + // With this break, it will output as (assuming [11, 15] at L2 is bigger + // than 1/8 of target size): + // L1: [1,3] [21,23] + // Then for the future compactions, [11,15] won't be included. + // For random datasets (either evenly distributed or skewed), it rarely + // triggers this condition, but if the user is adding 2 different datasets + // without any overlap, it may likely happen. + // More details, check PR #1963 + const size_t num_skippable_boundaries_crossed = + being_grandparent_gap_ ? 2 : 3; + if (compaction_->immutable_options()->compaction_style == + kCompactionStyleLevel && + compaction_->immutable_options()->level_compaction_dynamic_file_size && + num_grandparent_boundaries_crossed >= + num_skippable_boundaries_crossed && + grandparent_overlapped_bytes_ - previous_overlapped_bytes > + compaction_->target_output_file_size() / 8) { + return true; + } + + // Pre-cut the output file if it's reaching a certain size AND it's at the + // boundary of a grandparent file. It can reduce the future compaction size, + // the cost is having smaller files. + // The pre-cut size threshold is based on how many grandparent boundaries + // it has seen before. Basically, if it has seen no boundary at all, then it + // will pre-cut at 50% target file size. Every boundary it has seen + // increases the threshold by 5%, max at 90%, which it will always cut. 
+ // The idea is based on if it has seen more boundaries before, it will more + // likely to see another boundary (file cutting opportunity) before the + // target file size. The test shows it can generate larger files than a + // static threshold like 75% and has a similar write amplification + // improvement. + if (compaction_->immutable_options()->compaction_style == + kCompactionStyleLevel && + compaction_->immutable_options()->level_compaction_dynamic_file_size && + current_output_file_size_ >= + ((compaction_->target_output_file_size() + 99) / 100) * + (50 + std::min(grandparent_boundary_switched_num_ * 5, + size_t{40}))) { + return true; + } + } + + // check ttl file boundaries if there's any + if (!files_to_cut_for_ttl_.empty()) { + if (cur_files_to_cut_for_ttl_ != -1) { + // Previous key is inside the range of a file + if (icmp->Compare(internal_key, + files_to_cut_for_ttl_[cur_files_to_cut_for_ttl_] + ->largest.Encode()) > 0) { + next_files_to_cut_for_ttl_ = cur_files_to_cut_for_ttl_ + 1; + cur_files_to_cut_for_ttl_ = -1; + return true; + } + } else { + // Look for the key position + while (next_files_to_cut_for_ttl_ < + static_cast<int>(files_to_cut_for_ttl_.size())) { + if (icmp->Compare(internal_key, + files_to_cut_for_ttl_[next_files_to_cut_for_ttl_] + ->smallest.Encode()) >= 0) { + if (icmp->Compare(internal_key, + files_to_cut_for_ttl_[next_files_to_cut_for_ttl_] + ->largest.Encode()) <= 0) { + // With in the current file + cur_files_to_cut_for_ttl_ = next_files_to_cut_for_ttl_; + return true; + } + // Beyond the current file + next_files_to_cut_for_ttl_++; + } else { + // Still fall into the gap + break; + } + } + } + } + + return false; +} + +Status CompactionOutputs::AddToOutput( + const CompactionIterator& c_iter, + const CompactionFileOpenFunc& open_file_func, + const CompactionFileCloseFunc& close_file_func) { + Status s; + const Slice& key = c_iter.key(); + + if (ShouldStopBefore(c_iter) && HasBuilder()) { + s = close_file_func(*this, c_iter.InputStatus(), key); + if (!s.ok()) { + return s; + } + // reset grandparent information + grandparent_boundary_switched_num_ = 0; + grandparent_overlapped_bytes_ = + GetCurrentKeyGrandparentOverlappedBytes(key); + } + + // Open output file if necessary + if (!HasBuilder()) { + s = open_file_func(*this); + if (!s.ok()) { + return s; + } + } + + assert(builder_ != nullptr); + const Slice& value = c_iter.value(); + s = current_output().validator.Add(key, value); + if (!s.ok()) { + return s; + } + builder_->Add(key, value); + + stats_.num_output_records++; + current_output_file_size_ = builder_->EstimatedFileSize(); + + if (blob_garbage_meter_) { + s = blob_garbage_meter_->ProcessOutFlow(key, value); + } + + if (!s.ok()) { + return s; + } + + const ParsedInternalKey& ikey = c_iter.ikey(); + s = current_output().meta.UpdateBoundaries(key, value, ikey.sequence, + ikey.type); + + if (partitioner_) { + last_key_for_partitioner_.assign(c_iter.user_key().data_, + c_iter.user_key().size_); + } + + return s; +} + +Status CompactionOutputs::AddRangeDels( + const Slice* comp_start_user_key, const Slice* comp_end_user_key, + CompactionIterationStats& range_del_out_stats, bool bottommost_level, + const InternalKeyComparator& icmp, SequenceNumber earliest_snapshot, + const Slice& next_table_min_key, const std::string& full_history_ts_low) { + assert(HasRangeDel()); + FileMetaData& meta = current_output().meta; + const Comparator* ucmp = icmp.user_comparator(); + + Slice lower_bound_guard, upper_bound_guard; + std::string smallest_user_key; + 
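// A worked example of the size-based pre-cut threshold used in
// ShouldStopBefore() above, assuming a purely illustrative
// target_output_file_size() of 64 MB: with no grandparent boundary crossed
// yet, the output is cut once it reaches 50% of the target (about 32 MB);
// every boundary crossed raises the threshold by 5 percentage points, so
// after eight or more boundaries it is capped at 90% (about 57.6 MB).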
const Slice *lower_bound, *upper_bound; + bool lower_bound_from_sub_compact = false; + + size_t output_size = outputs_.size(); + if (output_size == 1) { + // For the first output table, include range tombstones before the min + // key but after the subcompaction boundary. + lower_bound = comp_start_user_key; + lower_bound_from_sub_compact = true; + } else if (meta.smallest.size() > 0) { + // For subsequent output tables, only include range tombstones from min + // key onwards since the previous file was extended to contain range + // tombstones falling before min key. + smallest_user_key = meta.smallest.user_key().ToString(false /*hex*/); + lower_bound_guard = Slice(smallest_user_key); + lower_bound = &lower_bound_guard; + } else { + lower_bound = nullptr; + } + if (!next_table_min_key.empty()) { + // This may be the last file in the subcompaction in some cases, so we + // need to compare the end key of subcompaction with the next file start + // key. When the end key is chosen by the subcompaction, we know that + // it must be the biggest key in output file. Therefore, it is safe to + // use the smaller key as the upper bound of the output file, to ensure + // that there is no overlapping between different output files. + upper_bound_guard = ExtractUserKey(next_table_min_key); + if (comp_end_user_key != nullptr && + ucmp->CompareWithoutTimestamp(upper_bound_guard, *comp_end_user_key) >= + 0) { + upper_bound = comp_end_user_key; + } else { + upper_bound = &upper_bound_guard; + } + } else { + // This is the last file in the subcompaction, so extend until the + // subcompaction ends. + upper_bound = comp_end_user_key; + } + bool has_overlapping_endpoints; + if (upper_bound != nullptr && meta.largest.size() > 0) { + has_overlapping_endpoints = ucmp->CompareWithoutTimestamp( + meta.largest.user_key(), *upper_bound) == 0; + } else { + has_overlapping_endpoints = false; + } + + // The end key of the subcompaction must be bigger or equal to the upper + // bound. If the end of subcompaction is null or the upper bound is null, + // it means that this file is the last file in the compaction. So there + // will be no overlapping between this file and others. + assert(comp_end_user_key == nullptr || upper_bound == nullptr || + ucmp->CompareWithoutTimestamp(*upper_bound, *comp_end_user_key) <= 0); + auto it = range_del_agg_->NewIterator(lower_bound, upper_bound, + has_overlapping_endpoints); + // Position the range tombstone output iterator. There may be tombstone + // fragments that are entirely out of range, so make sure that we do not + // include those. + if (lower_bound != nullptr) { + it->Seek(*lower_bound); + } else { + it->SeekToFirst(); + } + for (; it->Valid(); it->Next()) { + auto tombstone = it->Tombstone(); + if (upper_bound != nullptr) { + int cmp = + ucmp->CompareWithoutTimestamp(*upper_bound, tombstone.start_key_); + if ((has_overlapping_endpoints && cmp < 0) || + (!has_overlapping_endpoints && cmp <= 0)) { + // Tombstones starting after upper_bound only need to be included in + // the next table. If the current SST ends before upper_bound, i.e., + // `has_overlapping_endpoints == false`, we can also skip over range + // tombstones that start exactly at upper_bound. Such range + // tombstones will be included in the next file and are not relevant + // to the point keys or endpoints of the current file. + break; + } + } + + const size_t ts_sz = ucmp->timestamp_size(); + // Garbage collection for range tombstones. 
+ // If user-defined timestamp is enabled, range tombstones are dropped if + // they are at bottommost_level, below full_history_ts_low and not visible + // in any snapshot. trim_ts_ is passed to the constructor for + // range_del_agg_, and range_del_agg_ internally drops tombstones above + // trim_ts_. + if (bottommost_level && tombstone.seq_ <= earliest_snapshot && + (ts_sz == 0 || + (!full_history_ts_low.empty() && + ucmp->CompareTimestamp(tombstone.ts_, full_history_ts_low) < 0))) { + // TODO(andrewkr): tombstones that span multiple output files are + // counted for each compaction output file, so lots of double + // counting. + range_del_out_stats.num_range_del_drop_obsolete++; + range_del_out_stats.num_record_drop_obsolete++; + continue; + } + + auto kv = tombstone.Serialize(); + assert(lower_bound == nullptr || + ucmp->CompareWithoutTimestamp(*lower_bound, kv.second) < 0); + // Range tombstone is not supported by output validator yet. + builder_->Add(kv.first.Encode(), kv.second); + InternalKey smallest_candidate = std::move(kv.first); + if (lower_bound != nullptr && + ucmp->CompareWithoutTimestamp(smallest_candidate.user_key(), + *lower_bound) <= 0) { + // Pretend the smallest key has the same user key as lower_bound + // (the max key in the previous table or subcompaction) in order for + // files to appear key-space partitioned. + // + // When lower_bound is chosen by a subcompaction, we know that + // subcompactions over smaller keys cannot contain any keys at + // lower_bound. We also know that smaller subcompactions exist, + // because otherwise the subcompaction woud be unbounded on the left. + // As a result, we know that no other files on the output level will + // contain actual keys at lower_bound (an output file may have a + // largest key of lower_bound@kMaxSequenceNumber, but this only + // indicates a large range tombstone was truncated). Therefore, it is + // safe to use the tombstone's sequence number, to ensure that keys at + // lower_bound at lower levels are covered by truncated tombstones. + // + // If lower_bound was chosen by the smallest data key in the file, + // choose lowest seqnum so this file's smallest internal key comes + // after the previous file's largest. The fake seqnum is OK because + // the read path's file-picking code only considers user key. + if (lower_bound_from_sub_compact) { + if (ts_sz) { + assert(tombstone.ts_.size() == ts_sz); + smallest_candidate = InternalKey(*lower_bound, tombstone.seq_, + kTypeRangeDeletion, tombstone.ts_); + } else { + smallest_candidate = + InternalKey(*lower_bound, tombstone.seq_, kTypeRangeDeletion); + } + } else { + smallest_candidate = InternalKey(*lower_bound, 0, kTypeRangeDeletion); + } + } + InternalKey largest_candidate = tombstone.SerializeEndKey(); + if (upper_bound != nullptr && + ucmp->CompareWithoutTimestamp(*upper_bound, + largest_candidate.user_key()) <= 0) { + // Pretend the largest key has the same user key as upper_bound (the + // min key in the following table or subcompaction) in order for files + // to appear key-space partitioned. + // + // Choose highest seqnum so this file's largest internal key comes + // before the next file's/subcompaction's smallest. The fake seqnum is + // OK because the read path's file-picking code only considers the + // user key portion. 
+ // + // Note Seek() also creates InternalKey with (user_key, + // kMaxSequenceNumber), but with kTypeDeletion (0x7) instead of + // kTypeRangeDeletion (0xF), so the range tombstone comes before the + // Seek() key in InternalKey's ordering. So Seek() will look in the + // next file for the user key + if (ts_sz) { + static constexpr char kTsMax[] = "\xff\xff\xff\xff\xff\xff\xff\xff\xff"; + if (ts_sz <= strlen(kTsMax)) { + largest_candidate = + InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion, + Slice(kTsMax, ts_sz)); + } else { + largest_candidate = + InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion, + std::string(ts_sz, '\xff')); + } + } else { + largest_candidate = + InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion); + } + } +#ifndef NDEBUG + SequenceNumber smallest_ikey_seqnum = kMaxSequenceNumber; + if (meta.smallest.size() > 0) { + smallest_ikey_seqnum = GetInternalKeySeqno(meta.smallest.Encode()); + } +#endif + meta.UpdateBoundariesForRange(smallest_candidate, largest_candidate, + tombstone.seq_, icmp); + // The smallest key in a file is used for range tombstone truncation, so + // it cannot have a seqnum of 0 (unless the smallest data key in a file + // has a seqnum of 0). Otherwise, the truncated tombstone may expose + // deleted keys at lower levels. + assert(smallest_ikey_seqnum == 0 || + ExtractInternalKeyFooter(meta.smallest.Encode()) != + PackSequenceAndType(0, kTypeRangeDeletion)); + } + return Status::OK(); +} + +void CompactionOutputs::FillFilesToCutForTtl() { + if (compaction_->immutable_options()->compaction_style != + kCompactionStyleLevel || + compaction_->immutable_options()->compaction_pri != + kMinOverlappingRatio || + compaction_->mutable_cf_options()->ttl == 0 || + compaction_->num_input_levels() < 2 || compaction_->bottommost_level()) { + return; + } + + // We define new file with the oldest ancestor time to be younger than 1/4 + // TTL, and an old one to be older than 1/2 TTL time. + int64_t temp_current_time; + auto get_time_status = + compaction_->immutable_options()->clock->GetCurrentTime( + &temp_current_time); + if (!get_time_status.ok()) { + return; + } + + auto current_time = static_cast<uint64_t>(temp_current_time); + if (current_time < compaction_->mutable_cf_options()->ttl) { + return; + } + + uint64_t old_age_thres = + current_time - compaction_->mutable_cf_options()->ttl / 2; + const std::vector<FileMetaData*>& olevel = + *(compaction_->inputs(compaction_->num_input_levels() - 1)); + for (FileMetaData* file : olevel) { + // Worth filtering out by start and end? + uint64_t oldest_ancester_time = file->TryGetOldestAncesterTime(); + // We put old files if they are not too small to prevent a flood + // of small files. + if (oldest_ancester_time < old_age_thres && + file->fd.GetFileSize() > + compaction_->mutable_cf_options()->target_file_size_base / 2) { + files_to_cut_for_ttl_.push_back(file); + } + } +} + +CompactionOutputs::CompactionOutputs(const Compaction* compaction, + const bool is_penultimate_level) + : compaction_(compaction), is_penultimate_level_(is_penultimate_level) { + partitioner_ = compaction->output_level() == 0 + ? 
nullptr + : compaction->CreateSstPartitioner(); + + if (compaction->output_level() != 0) { + FillFilesToCutForTtl(); + } +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/compaction/compaction_outputs.h b/src/rocksdb/db/compaction/compaction_outputs.h new file mode 100644 index 000000000..f40aa8215 --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_outputs.h @@ -0,0 +1,385 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include "db/blob/blob_garbage_meter.h" +#include "db/compaction/compaction.h" +#include "db/compaction/compaction_iterator.h" +#include "db/internal_stats.h" +#include "db/output_validator.h" + +namespace ROCKSDB_NAMESPACE { + +class CompactionOutputs; +using CompactionFileOpenFunc = std::function<Status(CompactionOutputs&)>; +using CompactionFileCloseFunc = + std::function<Status(CompactionOutputs&, const Status&, const Slice&)>; + +// Files produced by subcompaction, most of the functions are used by +// compaction_job Open/Close compaction file functions. +class CompactionOutputs { + public: + // compaction output file + struct Output { + Output(FileMetaData&& _meta, const InternalKeyComparator& _icmp, + bool _enable_order_check, bool _enable_hash, bool _finished, + uint64_t precalculated_hash) + : meta(std::move(_meta)), + validator(_icmp, _enable_order_check, _enable_hash, + precalculated_hash), + finished(_finished) {} + FileMetaData meta; + OutputValidator validator; + bool finished; + std::shared_ptr<const TableProperties> table_properties; + }; + + CompactionOutputs() = delete; + + explicit CompactionOutputs(const Compaction* compaction, + const bool is_penultimate_level); + + // Add generated output to the list + void AddOutput(FileMetaData&& meta, const InternalKeyComparator& icmp, + bool enable_order_check, bool enable_hash, + bool finished = false, uint64_t precalculated_hash = 0) { + outputs_.emplace_back(std::move(meta), icmp, enable_order_check, + enable_hash, finished, precalculated_hash); + } + + // Set new table builder for the current output + void NewBuilder(const TableBuilderOptions& tboptions); + + // Assign a new WritableFileWriter to the current output + void AssignFileWriter(WritableFileWriter* writer) { + file_writer_.reset(writer); + } + + // TODO: Remove it when remote compaction support tiered compaction + void SetTotalBytes(uint64_t bytes) { stats_.bytes_written += bytes; } + void SetNumOutputRecords(uint64_t num) { stats_.num_output_records = num; } + + // TODO: Move the BlobDB builder into CompactionOutputs + const std::vector<BlobFileAddition>& GetBlobFileAdditions() const { + if (is_penultimate_level_) { + assert(blob_file_additions_.empty()); + } + return blob_file_additions_; + } + + std::vector<BlobFileAddition>* GetBlobFileAdditionsPtr() { + assert(!is_penultimate_level_); + return &blob_file_additions_; + } + + bool HasBlobFileAdditions() const { return !blob_file_additions_.empty(); } + + BlobGarbageMeter* CreateBlobGarbageMeter() { + assert(!is_penultimate_level_); + blob_garbage_meter_ = std::make_unique<BlobGarbageMeter>(); + return blob_garbage_meter_.get(); + } + + 
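// A rough sketch of how this class is meant to be driven, based on the
// CompactionFileOpenFunc/CompactionFileCloseFunc aliases above and the
// AddToOutput()/CloseOutput() members declared further down; the loop is
// illustrative only, the real driver being compaction_job via the friend
// class SubcompactionState:
//
//   while (c_iter->Valid() && status.ok()) {
//     status = outputs.AddToOutput(*c_iter, open_file_func, close_file_func);
//     if (status.ok()) {
//       c_iter->Next();
//     }
//   }
//   status = outputs.CloseOutput(status, open_file_func, close_file_func);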
BlobGarbageMeter* GetBlobGarbageMeter() const { + if (is_penultimate_level_) { + // blobdb doesn't support per_key_placement yet + assert(blob_garbage_meter_ == nullptr); + return nullptr; + } + return blob_garbage_meter_.get(); + } + + void UpdateBlobStats() { + assert(!is_penultimate_level_); + stats_.num_output_files_blob = blob_file_additions_.size(); + for (const auto& blob : blob_file_additions_) { + stats_.bytes_written_blob += blob.GetTotalBlobBytes(); + } + } + + // Finish the current output file + Status Finish(const Status& intput_status, + const SeqnoToTimeMapping& seqno_time_mapping); + + // Update output table properties from table builder + void UpdateTableProperties() { + current_output().table_properties = + std::make_shared<TableProperties>(GetTableProperties()); + } + + IOStatus WriterSyncClose(const Status& intput_status, SystemClock* clock, + Statistics* statistics, bool use_fsync); + + TableProperties GetTableProperties() { + return builder_->GetTableProperties(); + } + + Slice SmallestUserKey() const { + if (!outputs_.empty() && outputs_[0].finished) { + return outputs_[0].meta.smallest.user_key(); + } else { + return Slice{nullptr, 0}; + } + } + + Slice LargestUserKey() const { + if (!outputs_.empty() && outputs_.back().finished) { + return outputs_.back().meta.largest.user_key(); + } else { + return Slice{nullptr, 0}; + } + } + + // In case the last output file is empty, which doesn't need to keep. + void RemoveLastEmptyOutput() { + if (!outputs_.empty() && !outputs_.back().meta.fd.file_size) { + // An error occurred, so ignore the last output. + outputs_.pop_back(); + } + } + + // Remove the last output, for example the last output doesn't have data (no + // entry and no range-dels), but file_size might not be 0, as it has SST + // metadata. + void RemoveLastOutput() { + assert(!outputs_.empty()); + outputs_.pop_back(); + } + + bool HasBuilder() const { return builder_ != nullptr; } + + FileMetaData* GetMetaData() { return ¤t_output().meta; } + + bool HasOutput() const { return !outputs_.empty(); } + + uint64_t NumEntries() const { return builder_->NumEntries(); } + + void ResetBuilder() { + builder_.reset(); + current_output_file_size_ = 0; + } + + // Add range-dels from the aggregator to the current output file + // @param comp_start_user_key and comp_end_user_key include timestamp if + // user-defined timestamp is enabled. + // @param full_history_ts_low used for range tombstone garbage collection. + Status AddRangeDels(const Slice* comp_start_user_key, + const Slice* comp_end_user_key, + CompactionIterationStats& range_del_out_stats, + bool bottommost_level, const InternalKeyComparator& icmp, + SequenceNumber earliest_snapshot, + const Slice& next_table_min_key, + const std::string& full_history_ts_low); + + // if the outputs have range delete, range delete is also data + bool HasRangeDel() const { + return range_del_agg_ && !range_del_agg_->IsEmpty(); + } + + private: + friend class SubcompactionState; + + void FillFilesToCutForTtl(); + + void SetOutputSlitKey(const std::optional<Slice> start, + const std::optional<Slice> end) { + const InternalKeyComparator* icmp = + &compaction_->column_family_data()->internal_comparator(); + + const InternalKey* output_split_key = compaction_->GetOutputSplitKey(); + // Invalid output_split_key indicates that we do not need to split + if (output_split_key != nullptr) { + // We may only split the output when the cursor is in the range. 
Split + if ((!end.has_value() || + icmp->user_comparator()->Compare( + ExtractUserKey(output_split_key->Encode()), end.value()) < 0) && + (!start.has_value() || icmp->user_comparator()->Compare( + ExtractUserKey(output_split_key->Encode()), + start.value()) > 0)) { + local_output_split_key_ = output_split_key; + } + } + } + + // Returns true iff we should stop building the current output + // before processing the current key in compaction iterator. + bool ShouldStopBefore(const CompactionIterator& c_iter); + + void Cleanup() { + if (builder_ != nullptr) { + // May happen if we get a shutdown call in the middle of compaction + builder_->Abandon(); + builder_.reset(); + } + } + + // update tracked grandparents information like grandparent index, if it's + // in the gap between 2 grandparent files, accumulated grandparent files size + // etc. + // It returns how many boundaries it crosses by including current key. + size_t UpdateGrandparentBoundaryInfo(const Slice& internal_key); + + // helper function to get the overlapped grandparent files size, it's only + // used for calculating the first key's overlap. + uint64_t GetCurrentKeyGrandparentOverlappedBytes( + const Slice& internal_key) const; + + // Add current key from compaction_iterator to the output file. If needed + // close and open new compaction output with the functions provided. + Status AddToOutput(const CompactionIterator& c_iter, + const CompactionFileOpenFunc& open_file_func, + const CompactionFileCloseFunc& close_file_func); + + // Close the current output. `open_file_func` is needed for creating new file + // for range-dels only output file. + Status CloseOutput(const Status& curr_status, + const CompactionFileOpenFunc& open_file_func, + const CompactionFileCloseFunc& close_file_func) { + Status status = curr_status; + // handle subcompaction containing only range deletions + if (status.ok() && !HasBuilder() && !HasOutput() && HasRangeDel()) { + status = open_file_func(*this); + } + if (HasBuilder()) { + const Slice empty_key{}; + Status s = close_file_func(*this, status, empty_key); + if (!s.ok() && status.ok()) { + status = s; + } + } + + return status; + } + + // This subcompaction's output could be empty if compaction was aborted before + // this subcompaction had a chance to generate any output files. When + // subcompactions are executed sequentially this is more likely and will be + // particularly likely for the later subcompactions to be empty. Once they are + // run in parallel however it should be much rarer. + // It's caller's responsibility to make sure it's not empty. + Output& current_output() { + assert(!outputs_.empty()); + return outputs_.back(); + } + + // Assign the range_del_agg to the target output level. There's only one + // range-del-aggregator per compaction outputs, for + // output_to_penultimate_level compaction it is only assigned to the + // penultimate level. 
+ void AssignRangeDelAggregator( + std::unique_ptr<CompactionRangeDelAggregator>&& range_del_agg) { + assert(range_del_agg_ == nullptr); + range_del_agg_ = std::move(range_del_agg); + } + + const Compaction* compaction_; + + // current output builder and writer + std::unique_ptr<TableBuilder> builder_; + std::unique_ptr<WritableFileWriter> file_writer_; + uint64_t current_output_file_size_ = 0; + + // all the compaction outputs so far + std::vector<Output> outputs_; + + // BlobDB info + std::vector<BlobFileAddition> blob_file_additions_; + std::unique_ptr<BlobGarbageMeter> blob_garbage_meter_; + + // Basic compaction output stats for this level's outputs + InternalStats::CompactionOutputsStats stats_; + + // indicate if this CompactionOutputs obj for penultimate_level, should always + // be false if per_key_placement feature is not enabled. + const bool is_penultimate_level_; + std::unique_ptr<CompactionRangeDelAggregator> range_del_agg_ = nullptr; + + // partitioner information + std::string last_key_for_partitioner_; + std::unique_ptr<SstPartitioner> partitioner_; + + // A flag determines if this subcompaction has been split by the cursor + bool is_split_ = false; + + // We also maintain the output split key for each subcompaction to avoid + // repetitive comparison in ShouldStopBefore() + const InternalKey* local_output_split_key_ = nullptr; + + // Some identified files with old oldest ancester time and the range should be + // isolated out so that the output file(s) in that range can be merged down + // for TTL and clear the timestamps for the range. + std::vector<FileMetaData*> files_to_cut_for_ttl_; + int cur_files_to_cut_for_ttl_ = -1; + int next_files_to_cut_for_ttl_ = 0; + + // An index that used to speed up ShouldStopBefore(). + size_t grandparent_index_ = 0; + + // if the output key is being grandparent files gap, so: + // key > grandparents[grandparent_index_ - 1].largest && + // key < grandparents[grandparent_index_].smallest + bool being_grandparent_gap_ = true; + + // The number of bytes overlapping between the current output and + // grandparent files used in ShouldStopBefore(). + uint64_t grandparent_overlapped_bytes_ = 0; + + // A flag determines whether the key has been seen in ShouldStopBefore() + bool seen_key_ = false; + + // for the current output file, how many file boundaries has it crossed, + // basically number of files overlapped * 2 + size_t grandparent_boundary_switched_num_ = 0; +}; + +// helper struct to concatenate the last level and penultimate level outputs +// which could be replaced by std::ranges::join_view() in c++20 +struct OutputIterator { + public: + explicit OutputIterator(const std::vector<CompactionOutputs::Output>& a, + const std::vector<CompactionOutputs::Output>& b) + : a_(a), b_(b) { + within_a = !a_.empty(); + idx_ = 0; + } + + OutputIterator begin() { return *this; } + + OutputIterator end() { return *this; } + + size_t size() { return a_.size() + b_.size(); } + + const CompactionOutputs::Output& operator*() const { + return within_a ? 
a_[idx_] : b_[idx_]; + } + + OutputIterator& operator++() { + idx_++; + if (within_a && idx_ >= a_.size()) { + within_a = false; + idx_ = 0; + } + assert(within_a || idx_ <= b_.size()); + return *this; + } + + bool operator!=(const OutputIterator& /*rhs*/) const { + return within_a || idx_ < b_.size(); + } + + private: + const std::vector<CompactionOutputs::Output>& a_; + const std::vector<CompactionOutputs::Output>& b_; + bool within_a; + size_t idx_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/compaction/compaction_picker.cc b/src/rocksdb/db/compaction/compaction_picker.cc new file mode 100644 index 000000000..abdecca9f --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_picker.cc @@ -0,0 +1,1234 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/compaction/compaction_picker.h" + +#include <cinttypes> +#include <limits> +#include <queue> +#include <string> +#include <utility> +#include <vector> + +#include "db/column_family.h" +#include "file/filename.h" +#include "logging/log_buffer.h" +#include "logging/logging.h" +#include "monitoring/statistics.h" +#include "test_util/sync_point.h" +#include "util/random.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +bool FindIntraL0Compaction(const std::vector<FileMetaData*>& level_files, + size_t min_files_to_compact, + uint64_t max_compact_bytes_per_del_file, + uint64_t max_compaction_bytes, + CompactionInputFiles* comp_inputs, + SequenceNumber earliest_mem_seqno) { + // Do not pick ingested file when there is at least one memtable not flushed + // which of seqno is overlap with the sst. + TEST_SYNC_POINT("FindIntraL0Compaction"); + size_t start = 0; + for (; start < level_files.size(); start++) { + if (level_files[start]->being_compacted) { + return false; + } + // If there is no data in memtable, the earliest sequence number would the + // largest sequence number in last memtable. + // Because all files are sorted in descending order by largest_seqno, so we + // only need to check the first one. + if (level_files[start]->fd.largest_seqno <= earliest_mem_seqno) { + break; + } + } + if (start >= level_files.size()) { + return false; + } + size_t compact_bytes = static_cast<size_t>(level_files[start]->fd.file_size); + size_t compact_bytes_per_del_file = std::numeric_limits<size_t>::max(); + // Compaction range will be [start, limit). + size_t limit; + // Pull in files until the amount of compaction work per deleted file begins + // increasing or maximum total compaction size is reached. 
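// An illustrative walk through the loop below, assuming hypothetical L0 file
// sizes of 4 MB, 4 MB, 4 MB and 64 MB with start == 0: the metric
// compact_bytes / (limit - start) evaluates to 8 MB, then 6 MB, and would
// jump to roughly 25 MB once the 64 MB file is pulled in, so the loop stops
// there and only the first three files form the candidate range (provided the
// min_files_to_compact and max_compact_bytes_per_del_file thresholds are met).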
+ size_t new_compact_bytes_per_del_file = 0; + for (limit = start + 1; limit < level_files.size(); ++limit) { + compact_bytes += static_cast<size_t>(level_files[limit]->fd.file_size); + new_compact_bytes_per_del_file = compact_bytes / (limit - start); + if (level_files[limit]->being_compacted || + new_compact_bytes_per_del_file > compact_bytes_per_del_file || + compact_bytes > max_compaction_bytes) { + break; + } + compact_bytes_per_del_file = new_compact_bytes_per_del_file; + } + + if ((limit - start) >= min_files_to_compact && + compact_bytes_per_del_file < max_compact_bytes_per_del_file) { + assert(comp_inputs != nullptr); + comp_inputs->level = 0; + for (size_t i = start; i < limit; ++i) { + comp_inputs->files.push_back(level_files[i]); + } + return true; + } + return false; +} + +// Determine compression type, based on user options, level of the output +// file and whether compression is disabled. +// If enable_compression is false, then compression is always disabled no +// matter what the values of the other two parameters are. +// Otherwise, the compression type is determined based on options and level. +CompressionType GetCompressionType(const VersionStorageInfo* vstorage, + const MutableCFOptions& mutable_cf_options, + int level, int base_level, + const bool enable_compression) { + if (!enable_compression) { + // disable compression + return kNoCompression; + } + + // If bottommost_compression is set and we are compacting to the + // bottommost level then we should use it. + if (mutable_cf_options.bottommost_compression != kDisableCompressionOption && + level >= (vstorage->num_non_empty_levels() - 1)) { + return mutable_cf_options.bottommost_compression; + } + // If the user has specified a different compression level for each level, + // then pick the compression for that level. + if (!mutable_cf_options.compression_per_level.empty()) { + assert(level == 0 || level >= base_level); + int idx = (level == 0) ? 0 : level - base_level + 1; + + const int n = + static_cast<int>(mutable_cf_options.compression_per_level.size()) - 1; + // It is possible for level_ to be -1; in that case, we use level + // 0's compression. This occurs mostly in backwards compatibility + // situations when the builder doesn't know what level the file + // belongs to. Likewise, if level is beyond the end of the + // specified compression levels, use the last value. + return mutable_cf_options + .compression_per_level[std::max(0, std::min(idx, n))]; + } else { + return mutable_cf_options.compression; + } +} + +CompressionOptions GetCompressionOptions(const MutableCFOptions& cf_options, + const VersionStorageInfo* vstorage, + int level, + const bool enable_compression) { + if (!enable_compression) { + return cf_options.compression_opts; + } + // If bottommost_compression_opts is enabled and we are compacting to the + // bottommost level then we should use the specified compression options. + if (level >= (vstorage->num_non_empty_levels() - 1) && + cf_options.bottommost_compression_opts.enabled) { + return cf_options.bottommost_compression_opts; + } + return cf_options.compression_opts; +} + +CompactionPicker::CompactionPicker(const ImmutableOptions& ioptions, + const InternalKeyComparator* icmp) + : ioptions_(ioptions), icmp_(icmp) {} + +CompactionPicker::~CompactionPicker() {} + +// Delete this compaction from the list of running compactions. 
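// A small worked example for GetCompressionType() above, using purely
// illustrative settings: with compression_per_level = {kNoCompression,
// kLZ4Compression, kZSTD} and base_level == 2, level 0 maps to index 0
// (kNoCompression), level 2 maps to index 1 (kLZ4Compression), and a level
// past the end of the list, e.g. level 5 (index 4), is clamped to the last
// entry (kZSTD).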
+void CompactionPicker::ReleaseCompactionFiles(Compaction* c, Status status) { + UnregisterCompaction(c); + if (!status.ok()) { + c->ResetNextCompactionIndex(); + } +} + +void CompactionPicker::GetRange(const CompactionInputFiles& inputs, + InternalKey* smallest, + InternalKey* largest) const { + const int level = inputs.level; + assert(!inputs.empty()); + smallest->Clear(); + largest->Clear(); + + if (level == 0) { + for (size_t i = 0; i < inputs.size(); i++) { + FileMetaData* f = inputs[i]; + if (i == 0) { + *smallest = f->smallest; + *largest = f->largest; + } else { + if (icmp_->Compare(f->smallest, *smallest) < 0) { + *smallest = f->smallest; + } + if (icmp_->Compare(f->largest, *largest) > 0) { + *largest = f->largest; + } + } + } + } else { + *smallest = inputs[0]->smallest; + *largest = inputs[inputs.size() - 1]->largest; + } +} + +void CompactionPicker::GetRange(const CompactionInputFiles& inputs1, + const CompactionInputFiles& inputs2, + InternalKey* smallest, + InternalKey* largest) const { + assert(!inputs1.empty() || !inputs2.empty()); + if (inputs1.empty()) { + GetRange(inputs2, smallest, largest); + } else if (inputs2.empty()) { + GetRange(inputs1, smallest, largest); + } else { + InternalKey smallest1, smallest2, largest1, largest2; + GetRange(inputs1, &smallest1, &largest1); + GetRange(inputs2, &smallest2, &largest2); + *smallest = + icmp_->Compare(smallest1, smallest2) < 0 ? smallest1 : smallest2; + *largest = icmp_->Compare(largest1, largest2) < 0 ? largest2 : largest1; + } +} + +void CompactionPicker::GetRange(const std::vector<CompactionInputFiles>& inputs, + InternalKey* smallest, InternalKey* largest, + int exclude_level) const { + InternalKey current_smallest; + InternalKey current_largest; + bool initialized = false; + for (const auto& in : inputs) { + if (in.empty() || in.level == exclude_level) { + continue; + } + GetRange(in, ¤t_smallest, ¤t_largest); + if (!initialized) { + *smallest = current_smallest; + *largest = current_largest; + initialized = true; + } else { + if (icmp_->Compare(current_smallest, *smallest) < 0) { + *smallest = current_smallest; + } + if (icmp_->Compare(current_largest, *largest) > 0) { + *largest = current_largest; + } + } + } + assert(initialized); +} + +bool CompactionPicker::ExpandInputsToCleanCut(const std::string& /*cf_name*/, + VersionStorageInfo* vstorage, + CompactionInputFiles* inputs, + InternalKey** next_smallest) { + // This isn't good compaction + assert(!inputs->empty()); + + const int level = inputs->level; + // GetOverlappingInputs will always do the right thing for level-0. + // So we don't need to do any expansion if level == 0. + if (level == 0) { + return true; + } + + InternalKey smallest, largest; + + // Keep expanding inputs until we are sure that there is a "clean cut" + // boundary between the files in input and the surrounding files. + // This will ensure that no parts of a key are lost during compaction. + int hint_index = -1; + size_t old_size; + do { + old_size = inputs->size(); + GetRange(*inputs, &smallest, &largest); + inputs->clear(); + vstorage->GetOverlappingInputs(level, &smallest, &largest, &inputs->files, + hint_index, &hint_index, true, + next_smallest); + } while (inputs->size() > old_size); + + // we started off with inputs non-empty and the previous loop only grew + // inputs. thus, inputs should be non-empty here + assert(!inputs->empty()); + + // If, after the expansion, there are files that are already under + // compaction, then we must drop/cancel this compaction. 
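// To illustrate why the clean-cut expansion loop above matters: if versions
// of one user key straddle two neighbouring files on the input level and only
// the file holding the newer version were compacted, that newer version would
// move to the output level while the older one stayed behind on the input
// level, where reads would find it first and return stale data. Expanding the
// inputs until the boundary is a clean cut rules this out.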
+ if (AreFilesInCompaction(inputs->files)) { + return false; + } + return true; +} + +bool CompactionPicker::RangeOverlapWithCompaction( + const Slice& smallest_user_key, const Slice& largest_user_key, + int level) const { + const Comparator* ucmp = icmp_->user_comparator(); + for (Compaction* c : compactions_in_progress_) { + if (c->output_level() == level && + ucmp->CompareWithoutTimestamp(smallest_user_key, + c->GetLargestUserKey()) <= 0 && + ucmp->CompareWithoutTimestamp(largest_user_key, + c->GetSmallestUserKey()) >= 0) { + // Overlap + return true; + } + if (c->SupportsPerKeyPlacement()) { + if (c->OverlapPenultimateLevelOutputRange(smallest_user_key, + largest_user_key)) { + return true; + } + } + } + // Did not overlap with any running compaction in level `level` + return false; +} + +bool CompactionPicker::FilesRangeOverlapWithCompaction( + const std::vector<CompactionInputFiles>& inputs, int level, + int penultimate_level) const { + bool is_empty = true; + for (auto& in : inputs) { + if (!in.empty()) { + is_empty = false; + break; + } + } + if (is_empty) { + // No files in inputs + return false; + } + + // TODO: Intra L0 compactions can have the ranges overlapped, but the input + // files cannot be overlapped in the order of L0 files. + InternalKey smallest, largest; + GetRange(inputs, &smallest, &largest, Compaction::kInvalidLevel); + if (penultimate_level != Compaction::kInvalidLevel) { + if (ioptions_.compaction_style == kCompactionStyleUniversal) { + if (RangeOverlapWithCompaction(smallest.user_key(), largest.user_key(), + penultimate_level)) { + return true; + } + } else { + InternalKey penultimate_smallest, penultimate_largest; + GetRange(inputs, &penultimate_smallest, &penultimate_largest, level); + if (RangeOverlapWithCompaction(penultimate_smallest.user_key(), + penultimate_largest.user_key(), + penultimate_level)) { + return true; + } + } + } + + return RangeOverlapWithCompaction(smallest.user_key(), largest.user_key(), + level); +} + +// Returns true if any one of specified files are being compacted +bool CompactionPicker::AreFilesInCompaction( + const std::vector<FileMetaData*>& files) { + for (size_t i = 0; i < files.size(); i++) { + if (files[i]->being_compacted) { + return true; + } + } + return false; +} + +Compaction* CompactionPicker::CompactFiles( + const CompactionOptions& compact_options, + const std::vector<CompactionInputFiles>& input_files, int output_level, + VersionStorageInfo* vstorage, const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, uint32_t output_path_id) { +#ifndef NDEBUG + assert(input_files.size()); + // This compaction output should not overlap with a running compaction as + // `SanitizeCompactionInputFiles` should've checked earlier and db mutex + // shouldn't have been released since. 
+ int start_level = Compaction::kInvalidLevel; + for (const auto& in : input_files) { + // input_files should already be sorted by level + if (!in.empty()) { + start_level = in.level; + break; + } + } + assert(output_level == 0 || + !FilesRangeOverlapWithCompaction( + input_files, output_level, + Compaction::EvaluatePenultimateLevel(vstorage, ioptions_, + start_level, output_level))); +#endif /* !NDEBUG */ + + CompressionType compression_type; + if (compact_options.compression == kDisableCompressionOption) { + int base_level; + if (ioptions_.compaction_style == kCompactionStyleLevel) { + base_level = vstorage->base_level(); + } else { + base_level = 1; + } + compression_type = GetCompressionType(vstorage, mutable_cf_options, + output_level, base_level); + } else { + // TODO(ajkr): `CompactionOptions` offers configurable `CompressionType` + // without configurable `CompressionOptions`, which is inconsistent. + compression_type = compact_options.compression; + } + auto c = new Compaction( + vstorage, ioptions_, mutable_cf_options, mutable_db_options, input_files, + output_level, compact_options.output_file_size_limit, + mutable_cf_options.max_compaction_bytes, output_path_id, compression_type, + GetCompressionOptions(mutable_cf_options, vstorage, output_level), + Temperature::kUnknown, compact_options.max_subcompactions, + /* grandparents */ {}, true); + RegisterCompaction(c); + return c; +} + +Status CompactionPicker::GetCompactionInputsFromFileNumbers( + std::vector<CompactionInputFiles>* input_files, + std::unordered_set<uint64_t>* input_set, const VersionStorageInfo* vstorage, + const CompactionOptions& /*compact_options*/) const { + if (input_set->size() == 0U) { + return Status::InvalidArgument( + "Compaction must include at least one file."); + } + assert(input_files); + + std::vector<CompactionInputFiles> matched_input_files; + matched_input_files.resize(vstorage->num_levels()); + int first_non_empty_level = -1; + int last_non_empty_level = -1; + // TODO(yhchiang): use a lazy-initialized mapping from + // file_number to FileMetaData in Version. + for (int level = 0; level < vstorage->num_levels(); ++level) { + for (auto file : vstorage->LevelFiles(level)) { + auto iter = input_set->find(file->fd.GetNumber()); + if (iter != input_set->end()) { + matched_input_files[level].files.push_back(file); + input_set->erase(iter); + last_non_empty_level = level; + if (first_non_empty_level == -1) { + first_non_empty_level = level; + } + } + } + } + + if (!input_set->empty()) { + std::string message( + "Cannot find matched SST files for the following file numbers:"); + for (auto fn : *input_set) { + message += " "; + message += std::to_string(fn); + } + return Status::InvalidArgument(message); + } + + for (int level = first_non_empty_level; level <= last_non_empty_level; + ++level) { + matched_input_files[level].level = level; + input_files->emplace_back(std::move(matched_input_files[level])); + } + + return Status::OK(); +} + +// Returns true if any one of the parent files are being compacted +bool CompactionPicker::IsRangeInCompaction(VersionStorageInfo* vstorage, + const InternalKey* smallest, + const InternalKey* largest, + int level, int* level_index) { + std::vector<FileMetaData*> inputs; + assert(level < NumberLevels()); + + vstorage->GetOverlappingInputs(level, smallest, largest, &inputs, + level_index ? *level_index : 0, level_index); + return AreFilesInCompaction(inputs); +} + +// Populates the set of inputs of all other levels that overlap with the +// start level. 
+// Now we assume all levels except start level and output level are empty. +// Will also attempt to expand "start level" if that doesn't expand +// "output level" or cause "level" to include a file for compaction that has an +// overlapping user-key with another file. +// REQUIRES: input_level and output_level are different +// REQUIRES: inputs->empty() == false +// Returns false if files on parent level are currently in compaction, which +// means that we can't compact them +bool CompactionPicker::SetupOtherInputs( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, CompactionInputFiles* inputs, + CompactionInputFiles* output_level_inputs, int* parent_index, + int base_index, bool only_expand_towards_right) { + assert(!inputs->empty()); + assert(output_level_inputs->empty()); + const int input_level = inputs->level; + const int output_level = output_level_inputs->level; + if (input_level == output_level) { + // no possibility of conflict + return true; + } + + // For now, we only support merging two levels, start level and output level. + // We need to assert other levels are empty. + for (int l = input_level + 1; l < output_level; l++) { + assert(vstorage->NumLevelFiles(l) == 0); + } + + InternalKey smallest, largest; + + // Get the range one last time. + GetRange(*inputs, &smallest, &largest); + + // Populate the set of next-level files (inputs_GetOutputLevelInputs()) to + // include in compaction + vstorage->GetOverlappingInputs(output_level, &smallest, &largest, + &output_level_inputs->files, *parent_index, + parent_index); + if (AreFilesInCompaction(output_level_inputs->files)) { + return false; + } + if (!output_level_inputs->empty()) { + if (!ExpandInputsToCleanCut(cf_name, vstorage, output_level_inputs)) { + return false; + } + } + + // See if we can further grow the number of inputs in "level" without + // changing the number of "level+1" files we pick up. We also choose NOT + // to expand if this would cause "level" to include some entries for some + // user key, while excluding other entries for the same user key. This + // can happen when one user key spans multiple files. + if (!output_level_inputs->empty()) { + const uint64_t limit = mutable_cf_options.max_compaction_bytes; + const uint64_t output_level_inputs_size = + TotalFileSize(output_level_inputs->files); + const uint64_t inputs_size = TotalFileSize(inputs->files); + bool expand_inputs = false; + + CompactionInputFiles expanded_inputs; + expanded_inputs.level = input_level; + // Get closed interval of output level + InternalKey all_start, all_limit; + GetRange(*inputs, *output_level_inputs, &all_start, &all_limit); + bool try_overlapping_inputs = true; + if (only_expand_towards_right) { + // Round-robin compaction only allows expansion towards the larger side. 
+ vstorage->GetOverlappingInputs(input_level, &smallest, &all_limit, + &expanded_inputs.files, base_index, + nullptr); + } else { + vstorage->GetOverlappingInputs(input_level, &all_start, &all_limit, + &expanded_inputs.files, base_index, + nullptr); + } + uint64_t expanded_inputs_size = TotalFileSize(expanded_inputs.files); + if (!ExpandInputsToCleanCut(cf_name, vstorage, &expanded_inputs)) { + try_overlapping_inputs = false; + } + if (try_overlapping_inputs && expanded_inputs.size() > inputs->size() && + (mutable_cf_options.ignore_max_compaction_bytes_for_input || + output_level_inputs_size + expanded_inputs_size < limit) && + !AreFilesInCompaction(expanded_inputs.files)) { + InternalKey new_start, new_limit; + GetRange(expanded_inputs, &new_start, &new_limit); + CompactionInputFiles expanded_output_level_inputs; + expanded_output_level_inputs.level = output_level; + vstorage->GetOverlappingInputs(output_level, &new_start, &new_limit, + &expanded_output_level_inputs.files, + *parent_index, parent_index); + assert(!expanded_output_level_inputs.empty()); + if (!AreFilesInCompaction(expanded_output_level_inputs.files) && + ExpandInputsToCleanCut(cf_name, vstorage, + &expanded_output_level_inputs) && + expanded_output_level_inputs.size() == output_level_inputs->size()) { + expand_inputs = true; + } + } + if (!expand_inputs) { + vstorage->GetCleanInputsWithinInterval(input_level, &all_start, + &all_limit, &expanded_inputs.files, + base_index, nullptr); + expanded_inputs_size = TotalFileSize(expanded_inputs.files); + if (expanded_inputs.size() > inputs->size() && + (mutable_cf_options.ignore_max_compaction_bytes_for_input || + output_level_inputs_size + expanded_inputs_size < limit) && + !AreFilesInCompaction(expanded_inputs.files)) { + expand_inputs = true; + } + } + if (expand_inputs) { + ROCKS_LOG_INFO(ioptions_.logger, + "[%s] Expanding@%d %" ROCKSDB_PRIszt "+%" ROCKSDB_PRIszt + "(%" PRIu64 "+%" PRIu64 " bytes) to %" ROCKSDB_PRIszt + "+%" ROCKSDB_PRIszt " (%" PRIu64 "+%" PRIu64 " bytes)\n", + cf_name.c_str(), input_level, inputs->size(), + output_level_inputs->size(), inputs_size, + output_level_inputs_size, expanded_inputs.size(), + output_level_inputs->size(), expanded_inputs_size, + output_level_inputs_size); + inputs->files = expanded_inputs.files; + } + } else { + // Likely to be trivial move. Expand files if they are still trivial moves, + // but limit to mutable_cf_options.max_compaction_bytes or 8 files so that + // we don't create too much compaction pressure for the next level. 
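      // (As written, this branch is a no-op: when the output level has no
      // overlapping files, the original inputs are used unchanged.)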
+ } + return true; +} + +void CompactionPicker::GetGrandparents( + VersionStorageInfo* vstorage, const CompactionInputFiles& inputs, + const CompactionInputFiles& output_level_inputs, + std::vector<FileMetaData*>* grandparents) { + InternalKey start, limit; + GetRange(inputs, output_level_inputs, &start, &limit); + // Compute the set of grandparent files that overlap this compaction + // (parent == level+1; grandparent == level+2 or the first + // level after that has overlapping files) + for (int level = output_level_inputs.level + 1; level < NumberLevels(); + level++) { + vstorage->GetOverlappingInputs(level, &start, &limit, grandparents); + if (!grandparents->empty()) { + break; + } + } +} + +Compaction* CompactionPicker::CompactRange( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + int input_level, int output_level, + const CompactRangeOptions& compact_range_options, const InternalKey* begin, + const InternalKey* end, InternalKey** compaction_end, bool* manual_conflict, + uint64_t max_file_num_to_ignore, const std::string& trim_ts) { + // CompactionPickerFIFO has its own implementation of compact range + assert(ioptions_.compaction_style != kCompactionStyleFIFO); + + if (input_level == ColumnFamilyData::kCompactAllLevels) { + assert(ioptions_.compaction_style == kCompactionStyleUniversal); + + // Universal compaction with more than one level always compacts all the + // files together to the last level. + assert(vstorage->num_levels() > 1); + // DBImpl::CompactRange() set output level to be the last level + if (ioptions_.allow_ingest_behind) { + assert(output_level == vstorage->num_levels() - 2); + } else { + assert(output_level == vstorage->num_levels() - 1); + } + // DBImpl::RunManualCompaction will make full range for universal compaction + assert(begin == nullptr); + assert(end == nullptr); + *compaction_end = nullptr; + + int start_level = 0; + for (; start_level < vstorage->num_levels() && + vstorage->NumLevelFiles(start_level) == 0; + start_level++) { + } + if (start_level == vstorage->num_levels()) { + return nullptr; + } + + if ((start_level == 0) && (!level0_compactions_in_progress_.empty())) { + *manual_conflict = true; + // Only one level 0 compaction allowed + return nullptr; + } + + std::vector<CompactionInputFiles> inputs(vstorage->num_levels() - + start_level); + for (int level = start_level; level < vstorage->num_levels(); level++) { + inputs[level - start_level].level = level; + auto& files = inputs[level - start_level].files; + for (FileMetaData* f : vstorage->LevelFiles(level)) { + files.push_back(f); + } + if (AreFilesInCompaction(files)) { + *manual_conflict = true; + return nullptr; + } + } + + // 2 non-exclusive manual compactions could run at the same time producing + // overlaping outputs in the same level. + if (FilesRangeOverlapWithCompaction( + inputs, output_level, + Compaction::EvaluatePenultimateLevel(vstorage, ioptions_, + start_level, output_level))) { + // This compaction output could potentially conflict with the output + // of a currently running compaction, we cannot run it. 
+ *manual_conflict = true; + return nullptr; + } + + Compaction* c = new Compaction( + vstorage, ioptions_, mutable_cf_options, mutable_db_options, + std::move(inputs), output_level, + MaxFileSizeForLevel(mutable_cf_options, output_level, + ioptions_.compaction_style), + /* max_compaction_bytes */ LLONG_MAX, + compact_range_options.target_path_id, + GetCompressionType(vstorage, mutable_cf_options, output_level, 1), + GetCompressionOptions(mutable_cf_options, vstorage, output_level), + Temperature::kUnknown, compact_range_options.max_subcompactions, + /* grandparents */ {}, /* is manual */ true, trim_ts, /* score */ -1, + /* deletion_compaction */ false, /* l0_files_might_overlap */ true, + CompactionReason::kUnknown, + compact_range_options.blob_garbage_collection_policy, + compact_range_options.blob_garbage_collection_age_cutoff); + + RegisterCompaction(c); + vstorage->ComputeCompactionScore(ioptions_, mutable_cf_options); + return c; + } + + CompactionInputFiles inputs; + inputs.level = input_level; + bool covering_the_whole_range = true; + + // All files are 'overlapping' in universal style compaction. + // We have to compact the entire range in one shot. + if (ioptions_.compaction_style == kCompactionStyleUniversal) { + begin = nullptr; + end = nullptr; + } + + vstorage->GetOverlappingInputs(input_level, begin, end, &inputs.files); + if (inputs.empty()) { + return nullptr; + } + + if ((input_level == 0) && (!level0_compactions_in_progress_.empty())) { + // Only one level 0 compaction allowed + TEST_SYNC_POINT("CompactionPicker::CompactRange:Conflict"); + *manual_conflict = true; + return nullptr; + } + + // Avoid compacting too much in one shot in case the range is large. + // But we cannot do this for level-0 since level-0 files can overlap + // and we must not pick one file and drop another older file if the + // two files overlap. + if (input_level > 0) { + const uint64_t limit = mutable_cf_options.max_compaction_bytes; + uint64_t input_level_total = 0; + int hint_index = -1; + InternalKey* smallest = nullptr; + InternalKey* largest = nullptr; + for (size_t i = 0; i + 1 < inputs.size(); ++i) { + if (!smallest) { + smallest = &inputs[i]->smallest; + } + largest = &inputs[i]->largest; + + uint64_t input_file_size = inputs[i]->fd.GetFileSize(); + uint64_t output_level_total = 0; + if (output_level < vstorage->num_non_empty_levels()) { + std::vector<FileMetaData*> files; + vstorage->GetOverlappingInputsRangeBinarySearch( + output_level, smallest, largest, &files, hint_index, &hint_index); + for (const auto& file : files) { + output_level_total += file->fd.GetFileSize(); + } + } + + input_level_total += input_file_size; + + if (input_level_total + output_level_total >= limit) { + covering_the_whole_range = false; + // still include the current file, so the compaction could be larger + // than max_compaction_bytes, which is also to make sure the compaction + // can make progress even `max_compaction_bytes` is small (e.g. smaller + // than an SST file). + inputs.files.resize(i + 1); + break; + } + } + } + + assert(compact_range_options.target_path_id < + static_cast<uint32_t>(ioptions_.cf_paths.size())); + + // for BOTTOM LEVEL compaction only, use max_file_num_to_ignore to filter out + // files that are created during the current compaction. 
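  // (Illustrative example: if max_file_num_to_ignore is 123, only input files
  // numbered below 123 remain eligible; anything at or above 123 is treated as
  // an output of the ongoing manual compaction and is skipped, so the
  // compaction cannot keep re-picking its own freshly written files.)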
+ if (compact_range_options.bottommost_level_compaction == + BottommostLevelCompaction::kForceOptimized && + max_file_num_to_ignore != std::numeric_limits<uint64_t>::max()) { + assert(input_level == output_level); + // inputs_shrunk holds a continuous subset of input files which were all + // created before the current manual compaction + std::vector<FileMetaData*> inputs_shrunk; + size_t skip_input_index = inputs.size(); + for (size_t i = 0; i < inputs.size(); ++i) { + if (inputs[i]->fd.GetNumber() < max_file_num_to_ignore) { + inputs_shrunk.push_back(inputs[i]); + } else if (!inputs_shrunk.empty()) { + // inputs[i] was created during the current manual compaction and + // need to be skipped + skip_input_index = i; + break; + } + } + if (inputs_shrunk.empty()) { + return nullptr; + } + if (inputs.size() != inputs_shrunk.size()) { + inputs.files.swap(inputs_shrunk); + } + // set covering_the_whole_range to false if there is any file that need to + // be compacted in the range of inputs[skip_input_index+1, inputs.size()) + for (size_t i = skip_input_index + 1; i < inputs.size(); ++i) { + if (inputs[i]->fd.GetNumber() < max_file_num_to_ignore) { + covering_the_whole_range = false; + } + } + } + + InternalKey key_storage; + InternalKey* next_smallest = &key_storage; + if (ExpandInputsToCleanCut(cf_name, vstorage, &inputs, &next_smallest) == + false) { + // manual compaction is now multi-threaded, so it can + // happen that ExpandWhileOverlapping fails + // we handle it higher in RunManualCompaction + *manual_conflict = true; + return nullptr; + } + + if (covering_the_whole_range || !next_smallest) { + *compaction_end = nullptr; + } else { + **compaction_end = *next_smallest; + } + + CompactionInputFiles output_level_inputs; + if (output_level == ColumnFamilyData::kCompactToBaseLevel) { + assert(input_level == 0); + output_level = vstorage->base_level(); + assert(output_level > 0); + } + output_level_inputs.level = output_level; + if (input_level != output_level) { + int parent_index = -1; + if (!SetupOtherInputs(cf_name, mutable_cf_options, vstorage, &inputs, + &output_level_inputs, &parent_index, -1)) { + // manual compaction is now multi-threaded, so it can + // happen that SetupOtherInputs fails + // we handle it higher in RunManualCompaction + *manual_conflict = true; + return nullptr; + } + } + + std::vector<CompactionInputFiles> compaction_inputs({inputs}); + if (!output_level_inputs.empty()) { + compaction_inputs.push_back(output_level_inputs); + } + for (size_t i = 0; i < compaction_inputs.size(); i++) { + if (AreFilesInCompaction(compaction_inputs[i].files)) { + *manual_conflict = true; + return nullptr; + } + } + + // 2 non-exclusive manual compactions could run at the same time producing + // overlaping outputs in the same level. + if (FilesRangeOverlapWithCompaction( + compaction_inputs, output_level, + Compaction::EvaluatePenultimateLevel(vstorage, ioptions_, input_level, + output_level))) { + // This compaction output could potentially conflict with the output + // of a currently running compaction, we cannot run it. 
+ *manual_conflict = true; + return nullptr; + } + + std::vector<FileMetaData*> grandparents; + GetGrandparents(vstorage, inputs, output_level_inputs, &grandparents); + Compaction* compaction = new Compaction( + vstorage, ioptions_, mutable_cf_options, mutable_db_options, + std::move(compaction_inputs), output_level, + MaxFileSizeForLevel(mutable_cf_options, output_level, + ioptions_.compaction_style, vstorage->base_level(), + ioptions_.level_compaction_dynamic_level_bytes), + mutable_cf_options.max_compaction_bytes, + compact_range_options.target_path_id, + GetCompressionType(vstorage, mutable_cf_options, output_level, + vstorage->base_level()), + GetCompressionOptions(mutable_cf_options, vstorage, output_level), + Temperature::kUnknown, compact_range_options.max_subcompactions, + std::move(grandparents), /* is manual */ true, trim_ts, /* score */ -1, + /* deletion_compaction */ false, /* l0_files_might_overlap */ true, + CompactionReason::kUnknown, + compact_range_options.blob_garbage_collection_policy, + compact_range_options.blob_garbage_collection_age_cutoff); + + TEST_SYNC_POINT_CALLBACK("CompactionPicker::CompactRange:Return", compaction); + RegisterCompaction(compaction); + + // Creating a compaction influences the compaction score because the score + // takes running compactions into account (by skipping files that are already + // being compacted). Since we just changed compaction score, we recalculate it + // here + vstorage->ComputeCompactionScore(ioptions_, mutable_cf_options); + + return compaction; +} + +#ifndef ROCKSDB_LITE +namespace { +// Test whether two files have overlapping key-ranges. +bool HaveOverlappingKeyRanges(const Comparator* c, const SstFileMetaData& a, + const SstFileMetaData& b) { + if (c->CompareWithoutTimestamp(a.smallestkey, b.smallestkey) >= 0) { + if (c->CompareWithoutTimestamp(a.smallestkey, b.largestkey) <= 0) { + // b.smallestkey <= a.smallestkey <= b.largestkey + return true; + } + } else if (c->CompareWithoutTimestamp(a.largestkey, b.smallestkey) >= 0) { + // a.smallestkey < b.smallestkey <= a.largestkey + return true; + } + if (c->CompareWithoutTimestamp(a.largestkey, b.largestkey) <= 0) { + if (c->CompareWithoutTimestamp(a.largestkey, b.smallestkey) >= 0) { + // b.smallestkey <= a.largestkey <= b.largestkey + return true; + } + } else if (c->CompareWithoutTimestamp(a.smallestkey, b.largestkey) <= 0) { + // a.smallestkey <= b.largestkey < a.largestkey + return true; + } + return false; +} +} // namespace + +Status CompactionPicker::SanitizeCompactionInputFilesForAllLevels( + std::unordered_set<uint64_t>* input_files, + const ColumnFamilyMetaData& cf_meta, const int output_level) const { + auto& levels = cf_meta.levels; + auto comparator = icmp_->user_comparator(); + + // TODO(yhchiang): add is_adjustable to CompactionOptions + + // the smallest and largest key of the current compaction input + std::string smallestkey; + std::string largestkey; + // a flag for initializing smallest and largest key + bool is_first = false; + const int kNotFound = -1; + + // For each level, it does the following things: + // 1. Find the first and the last compaction input files + // in the current level. + // 2. Include all files between the first and the last + // compaction input files. + // 3. Update the compaction key-range. + // 4. For all remaining levels, include files that have + // overlapping key-range with the compaction key-range. 
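  // (Illustrative walk-through with a hypothetical layout: if the caller passes
  // only an L1 file spanning [d..f] and output_level is 2, steps 1-2 keep that
  // file plus any L1 neighbours whose user keys overlap it, step 3 widens the
  // compaction key-range accordingly, and step 4 then pulls in every file on
  // the remaining levels up to the output level that overlaps that range.)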
+ for (int l = 0; l <= output_level; ++l) { + auto& current_files = levels[l].files; + int first_included = static_cast<int>(current_files.size()); + int last_included = kNotFound; + + // identify the first and the last compaction input files + // in the current level. + for (size_t f = 0; f < current_files.size(); ++f) { + const uint64_t file_number = TableFileNameToNumber(current_files[f].name); + if (input_files->find(file_number) == input_files->end()) { + continue; + } + first_included = std::min(first_included, static_cast<int>(f)); + last_included = std::max(last_included, static_cast<int>(f)); + if (is_first == false) { + smallestkey = current_files[f].smallestkey; + largestkey = current_files[f].largestkey; + is_first = true; + } + } + if (last_included == kNotFound) { + continue; + } + + if (l != 0) { + // expand the compaction input of the current level if it + // has overlapping key-range with other non-compaction input + // files in the same level. + while (first_included > 0) { + if (comparator->CompareWithoutTimestamp( + current_files[first_included - 1].largestkey, + current_files[first_included].smallestkey) < 0) { + break; + } + first_included--; + } + + while (last_included < static_cast<int>(current_files.size()) - 1) { + if (comparator->CompareWithoutTimestamp( + current_files[last_included + 1].smallestkey, + current_files[last_included].largestkey) > 0) { + break; + } + last_included++; + } + } else if (output_level > 0) { + last_included = static_cast<int>(current_files.size() - 1); + } + + // include all files between the first and the last compaction input files. + for (int f = first_included; f <= last_included; ++f) { + if (current_files[f].being_compacted) { + return Status::Aborted("Necessary compaction input file " + + current_files[f].name + + " is currently being compacted."); + } + input_files->insert(TableFileNameToNumber(current_files[f].name)); + } + + // update smallest and largest key + if (l == 0) { + for (int f = first_included; f <= last_included; ++f) { + if (comparator->CompareWithoutTimestamp( + smallestkey, current_files[f].smallestkey) > 0) { + smallestkey = current_files[f].smallestkey; + } + if (comparator->CompareWithoutTimestamp( + largestkey, current_files[f].largestkey) < 0) { + largestkey = current_files[f].largestkey; + } + } + } else { + if (comparator->CompareWithoutTimestamp( + smallestkey, current_files[first_included].smallestkey) > 0) { + smallestkey = current_files[first_included].smallestkey; + } + if (comparator->CompareWithoutTimestamp( + largestkey, current_files[last_included].largestkey) < 0) { + largestkey = current_files[last_included].largestkey; + } + } + + SstFileMetaData aggregated_file_meta; + aggregated_file_meta.smallestkey = smallestkey; + aggregated_file_meta.largestkey = largestkey; + + // For all lower levels, include all overlapping files. 
+ // We need to add overlapping files from the current level too because even + // if there no input_files in level l, we would still need to add files + // which overlap with the range containing the input_files in levels 0 to l + // Level 0 doesn't need to be handled this way because files are sorted by + // time and not by key + for (int m = std::max(l, 1); m <= output_level; ++m) { + for (auto& next_lv_file : levels[m].files) { + if (HaveOverlappingKeyRanges(comparator, aggregated_file_meta, + next_lv_file)) { + if (next_lv_file.being_compacted) { + return Status::Aborted( + "File " + next_lv_file.name + + " that has overlapping key range with one of the compaction " + " input file is currently being compacted."); + } + input_files->insert(TableFileNameToNumber(next_lv_file.name)); + } + } + } + } + if (RangeOverlapWithCompaction(smallestkey, largestkey, output_level)) { + return Status::Aborted( + "A running compaction is writing to the same output level in an " + "overlapping key range"); + } + return Status::OK(); +} + +Status CompactionPicker::SanitizeCompactionInputFiles( + std::unordered_set<uint64_t>* input_files, + const ColumnFamilyMetaData& cf_meta, const int output_level) const { + assert(static_cast<int>(cf_meta.levels.size()) - 1 == + cf_meta.levels[cf_meta.levels.size() - 1].level); + if (output_level >= static_cast<int>(cf_meta.levels.size())) { + return Status::InvalidArgument( + "Output level for column family " + cf_meta.name + + " must between [0, " + + std::to_string(cf_meta.levels[cf_meta.levels.size() - 1].level) + "]."); + } + + if (output_level > MaxOutputLevel()) { + return Status::InvalidArgument( + "Exceed the maximum output level defined by " + "the current compaction algorithm --- " + + std::to_string(MaxOutputLevel())); + } + + if (output_level < 0) { + return Status::InvalidArgument("Output level cannot be negative."); + } + + if (input_files->size() == 0) { + return Status::InvalidArgument( + "A compaction must contain at least one file."); + } + + Status s = SanitizeCompactionInputFilesForAllLevels(input_files, cf_meta, + output_level); + + if (!s.ok()) { + return s; + } + + // for all input files, check whether the file number matches + // any currently-existing files. 
+ for (auto file_num : *input_files) { + bool found = false; + int input_file_level = -1; + for (const auto& level_meta : cf_meta.levels) { + for (const auto& file_meta : level_meta.files) { + if (file_num == TableFileNameToNumber(file_meta.name)) { + if (file_meta.being_compacted) { + return Status::Aborted("Specified compaction input file " + + MakeTableFileName("", file_num) + + " is already being compacted."); + } + found = true; + input_file_level = level_meta.level; + break; + } + } + if (found) { + break; + } + } + if (!found) { + return Status::InvalidArgument( + "Specified compaction input file " + MakeTableFileName("", file_num) + + " does not exist in column family " + cf_meta.name + "."); + } + if (input_file_level > output_level) { + return Status::InvalidArgument( + "Cannot compact file to up level, input file: " + + MakeTableFileName("", file_num) + " level " + + std::to_string(input_file_level) + " > output level " + + std::to_string(output_level)); + } + } + + return Status::OK(); +} +#endif // !ROCKSDB_LITE + +void CompactionPicker::RegisterCompaction(Compaction* c) { + if (c == nullptr) { + return; + } + assert(ioptions_.compaction_style != kCompactionStyleLevel || + c->output_level() == 0 || + !FilesRangeOverlapWithCompaction(*c->inputs(), c->output_level(), + c->GetPenultimateLevel())); + if (c->start_level() == 0 || + ioptions_.compaction_style == kCompactionStyleUniversal) { + level0_compactions_in_progress_.insert(c); + } + compactions_in_progress_.insert(c); + TEST_SYNC_POINT_CALLBACK("CompactionPicker::RegisterCompaction:Registered", + c); +} + +void CompactionPicker::UnregisterCompaction(Compaction* c) { + if (c == nullptr) { + return; + } + if (c->start_level() == 0 || + ioptions_.compaction_style == kCompactionStyleUniversal) { + level0_compactions_in_progress_.erase(c); + } + compactions_in_progress_.erase(c); +} + +void CompactionPicker::PickFilesMarkedForCompaction( + const std::string& cf_name, VersionStorageInfo* vstorage, int* start_level, + int* output_level, CompactionInputFiles* start_level_inputs) { + if (vstorage->FilesMarkedForCompaction().empty()) { + return; + } + + auto continuation = [&, cf_name](std::pair<int, FileMetaData*> level_file) { + // If it's being compacted it has nothing to do here. + // If this assert() fails that means that some function marked some + // files as being_compacted, but didn't call ComputeCompactionScore() + assert(!level_file.second->being_compacted); + *start_level = level_file.first; + *output_level = + (*start_level == 0) ? vstorage->base_level() : *start_level + 1; + + if (*start_level == 0 && !level0_compactions_in_progress()->empty()) { + return false; + } + + start_level_inputs->files = {level_file.second}; + start_level_inputs->level = *start_level; + return ExpandInputsToCleanCut(cf_name, vstorage, start_level_inputs); + }; + + // take a chance on a random file first + Random64 rnd(/* seed */ reinterpret_cast<uint64_t>(vstorage)); + size_t random_file_index = static_cast<size_t>(rnd.Uniform( + static_cast<uint64_t>(vstorage->FilesMarkedForCompaction().size()))); + TEST_SYNC_POINT_CALLBACK("CompactionPicker::PickFilesMarkedForCompaction", + &random_file_index); + + if (continuation(vstorage->FilesMarkedForCompaction()[random_file_index])) { + // found the compaction! + return; + } + + for (auto& level_file : vstorage->FilesMarkedForCompaction()) { + if (continuation(level_file)) { + // found the compaction! 
+ return; + } + } + start_level_inputs->files.clear(); +} + +bool CompactionPicker::GetOverlappingL0Files( + VersionStorageInfo* vstorage, CompactionInputFiles* start_level_inputs, + int output_level, int* parent_index) { + // Two level 0 compaction won't run at the same time, so don't need to worry + // about files on level 0 being compacted. + assert(level0_compactions_in_progress()->empty()); + InternalKey smallest, largest; + GetRange(*start_level_inputs, &smallest, &largest); + // Note that the next call will discard the file we placed in + // c->inputs_[0] earlier and replace it with an overlapping set + // which will include the picked file. + start_level_inputs->files.clear(); + vstorage->GetOverlappingInputs(0, &smallest, &largest, + &(start_level_inputs->files)); + + // If we include more L0 files in the same compaction run it can + // cause the 'smallest' and 'largest' key to get extended to a + // larger range. So, re-invoke GetRange to get the new key range + GetRange(*start_level_inputs, &smallest, &largest); + if (IsRangeInCompaction(vstorage, &smallest, &largest, output_level, + parent_index)) { + return false; + } + assert(!start_level_inputs->files.empty()); + + return true; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/compaction/compaction_picker.h b/src/rocksdb/db/compaction/compaction_picker.h new file mode 100644 index 000000000..7739dd96b --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_picker.h @@ -0,0 +1,323 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include <memory> +#include <set> +#include <string> +#include <unordered_set> +#include <vector> + +#include "db/compaction/compaction.h" +#include "db/version_set.h" +#include "options/cf_options.h" +#include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +// The file contains an abstract class CompactionPicker, and its two +// sub-classes LevelCompactionPicker and NullCompactionPicker, as +// well as some helper functions used by them. + +class LogBuffer; +class Compaction; +class VersionStorageInfo; +struct CompactionInputFiles; + +// An abstract class to pick compactions from an existing LSM-tree. +// +// Each compaction style inherits the class and implement the +// interface to form automatic compactions. If NeedCompaction() is true, +// then call PickCompaction() to find what files need to be compacted +// and where to put the output files. +// +// Non-virtual functions CompactRange() and CompactFiles() are used to +// pick files to compact based on users' DB::CompactRange() and +// DB::CompactFiles() requests, respectively. There is little +// compaction style specific logic for them. +class CompactionPicker { + public: + CompactionPicker(const ImmutableOptions& ioptions, + const InternalKeyComparator* icmp); + virtual ~CompactionPicker(); + + // Pick level and inputs for a new compaction. + // Returns nullptr if there is no compaction to be done. + // Otherwise returns a pointer to a heap-allocated object that + // describes the compaction. 
Caller should delete the result. + virtual Compaction* PickCompaction( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + LogBuffer* log_buffer, + SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) = 0; + + // Return a compaction object for compacting the range [begin,end] in + // the specified level. Returns nullptr if there is nothing in that + // level that overlaps the specified range. Caller should delete + // the result. + // + // The returned Compaction might not include the whole requested range. + // In that case, compaction_end will be set to the next key that needs + // compacting. In case the compaction will compact the whole range, + // compaction_end will be set to nullptr. + // Client is responsible for compaction_end storage -- when called, + // *compaction_end should point to a valid InternalKey! + virtual Compaction* CompactRange( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + int input_level, int output_level, + const CompactRangeOptions& compact_range_options, + const InternalKey* begin, const InternalKey* end, + InternalKey** compaction_end, bool* manual_conflict, + uint64_t max_file_num_to_ignore, const std::string& trim_ts); + + // The maximum allowed output level. Default value is NumberLevels() - 1. + virtual int MaxOutputLevel() const { return NumberLevels() - 1; } + + virtual bool NeedsCompaction(const VersionStorageInfo* vstorage) const = 0; + +// Sanitize the input set of compaction input files. +// When the input parameters do not describe a valid compaction, the +// function will try to fix the input_files by adding necessary +// files. If it's not possible to convert an invalid input_files +// into a valid one by adding more files, the function will return a +// non-ok status with a specific reason. +#ifndef ROCKSDB_LITE + Status SanitizeCompactionInputFiles(std::unordered_set<uint64_t>* input_files, + const ColumnFamilyMetaData& cf_meta, + const int output_level) const; +#endif // ROCKSDB_LITE + + // Free up the files that participated in a compaction + // + // Requirement: DB mutex held + void ReleaseCompactionFiles(Compaction* c, Status status); + + // Returns true if any one of the specified files are being compacted + bool AreFilesInCompaction(const std::vector<FileMetaData*>& files); + + // Takes a list of CompactionInputFiles and returns a (manual) Compaction + // object. + // + // Caller must provide a set of input files that has been passed through + // `SanitizeCompactionInputFiles` earlier. The lock should not be released + // between that call and this one. + Compaction* CompactFiles(const CompactionOptions& compact_options, + const std::vector<CompactionInputFiles>& input_files, + int output_level, VersionStorageInfo* vstorage, + const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, + uint32_t output_path_id); + + // Converts a set of compaction input file numbers into + // a list of CompactionInputFiles.
+ Status GetCompactionInputsFromFileNumbers( + std::vector<CompactionInputFiles>* input_files, + std::unordered_set<uint64_t>* input_set, + const VersionStorageInfo* vstorage, + const CompactionOptions& compact_options) const; + + // Is there currently a compaction involving level 0 taking place? + bool IsLevel0CompactionInProgress() const { + return !level0_compactions_in_progress_.empty(); + } + + // Return true if the passed key range overlaps with a compaction output + // that is currently running. + bool RangeOverlapWithCompaction(const Slice& smallest_user_key, + const Slice& largest_user_key, + int level) const; + + // Stores the minimal range that covers all entries in inputs in + // *smallest, *largest. + // REQUIRES: inputs is not empty + void GetRange(const CompactionInputFiles& inputs, InternalKey* smallest, + InternalKey* largest) const; + + // Stores the minimal range that covers all entries in inputs1 and inputs2 + // in *smallest, *largest. + // REQUIRES: inputs is not empty + void GetRange(const CompactionInputFiles& inputs1, + const CompactionInputFiles& inputs2, InternalKey* smallest, + InternalKey* largest) const; + + // Stores the minimal range that covers all entries in inputs + // in *smallest, *largest. + // REQUIRES: inputs is not empty (at least one entry must have one file) + void GetRange(const std::vector<CompactionInputFiles>& inputs, + InternalKey* smallest, InternalKey* largest, + int exclude_level) const; + + int NumberLevels() const { return ioptions_.num_levels; } + + // Add more files to the inputs on "level" to make sure that + // no newer version of a key is compacted to "level+1" while leaving an older + // version in a "level". Otherwise, any Get() will search "level" first, + // and will likely return an old/stale value for the key, since it always + // searches in increasing order of level to find the value. This could + // also scramble the order of merge operands. This function should be + // called any time a new Compaction is created, and its inputs_[0] are + // populated. + // + // Will return false if it is impossible to apply this compaction. + bool ExpandInputsToCleanCut(const std::string& cf_name, + VersionStorageInfo* vstorage, + CompactionInputFiles* inputs, + InternalKey** next_smallest = nullptr); + + // Returns true if any one of the parent files are being compacted + bool IsRangeInCompaction(VersionStorageInfo* vstorage, + const InternalKey* smallest, + const InternalKey* largest, int level, int* index); + + // Returns true if the key range that `inputs` files cover overlaps with the + // key range of a currently running compaction.
+ bool FilesRangeOverlapWithCompaction( + const std::vector<CompactionInputFiles>& inputs, int level, + int penultimate_level) const; + + bool SetupOtherInputs(const std::string& cf_name, + const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, + CompactionInputFiles* inputs, + CompactionInputFiles* output_level_inputs, + int* parent_index, int base_index, + bool only_expand_towards_right = false); + + void GetGrandparents(VersionStorageInfo* vstorage, + const CompactionInputFiles& inputs, + const CompactionInputFiles& output_level_inputs, + std::vector<FileMetaData*>* grandparents); + + void PickFilesMarkedForCompaction(const std::string& cf_name, + VersionStorageInfo* vstorage, + int* start_level, int* output_level, + CompactionInputFiles* start_level_inputs); + + bool GetOverlappingL0Files(VersionStorageInfo* vstorage, + CompactionInputFiles* start_level_inputs, + int output_level, int* parent_index); + + // Register this compaction in the set of running compactions + void RegisterCompaction(Compaction* c); + + // Remove this compaction from the set of running compactions + void UnregisterCompaction(Compaction* c); + + std::set<Compaction*>* level0_compactions_in_progress() { + return &level0_compactions_in_progress_; + } + std::unordered_set<Compaction*>* compactions_in_progress() { + return &compactions_in_progress_; + } + + const InternalKeyComparator* icmp() const { return icmp_; } + + protected: + const ImmutableOptions& ioptions_; + +// A helper function to SanitizeCompactionInputFiles() that +// sanitizes "input_files" by adding necessary files. +#ifndef ROCKSDB_LITE + virtual Status SanitizeCompactionInputFilesForAllLevels( + std::unordered_set<uint64_t>* input_files, + const ColumnFamilyMetaData& cf_meta, const int output_level) const; +#endif // ROCKSDB_LITE + + // Keeps track of all compactions that are running on Level0. + // Protected by DB mutex + std::set<Compaction*> level0_compactions_in_progress_; + + // Keeps track of all compactions that are running. + // Protected by DB mutex + std::unordered_set<Compaction*> compactions_in_progress_; + + const InternalKeyComparator* const icmp_; +}; + +#ifndef ROCKSDB_LITE +// A dummy compaction that never triggers any automatic +// compaction. +class NullCompactionPicker : public CompactionPicker { + public: + NullCompactionPicker(const ImmutableOptions& ioptions, + const InternalKeyComparator* icmp) + : CompactionPicker(ioptions, icmp) {} + virtual ~NullCompactionPicker() {} + + // Always return "nullptr" + Compaction* PickCompaction( + const std::string& /*cf_name*/, + const MutableCFOptions& /*mutable_cf_options*/, + const MutableDBOptions& /*mutable_db_options*/, + VersionStorageInfo* /*vstorage*/, LogBuffer* /* log_buffer */, + SequenceNumber /* earliest_memtable_seqno */) override { + return nullptr; + } + + // Always return "nullptr" + Compaction* CompactRange(const std::string& /*cf_name*/, + const MutableCFOptions& /*mutable_cf_options*/, + const MutableDBOptions& /*mutable_db_options*/, + VersionStorageInfo* /*vstorage*/, + int /*input_level*/, int /*output_level*/, + const CompactRangeOptions& /*compact_range_options*/, + const InternalKey* /*begin*/, + const InternalKey* /*end*/, + InternalKey** /*compaction_end*/, + bool* /*manual_conflict*/, + uint64_t /*max_file_num_to_ignore*/, + const std::string& /*trim_ts*/) override { + return nullptr; + } + + // Always returns false. 
+ virtual bool NeedsCompaction( + const VersionStorageInfo* /*vstorage*/) const override { + return false; + } +}; +#endif // !ROCKSDB_LITE + +// Attempts to find an intra L0 compaction conforming to the given parameters. +// +// @param level_files Metadata for L0 files. +// @param min_files_to_compact Minimum number of files required to +// do the compaction. +// @param max_compact_bytes_per_del_file Maximum average size in bytes per +// file that is going to get deleted by +// the compaction. +// @param max_compaction_bytes Maximum total size in bytes (in terms +// of compensated file size) for files +// to be compacted. +// @param [out] comp_inputs If a compaction was found, will be +// initialized with corresponding input +// files. Cannot be nullptr. +// +// @return true iff compaction was found. +bool FindIntraL0Compaction( + const std::vector<FileMetaData*>& level_files, size_t min_files_to_compact, + uint64_t max_compact_bytes_per_del_file, uint64_t max_compaction_bytes, + CompactionInputFiles* comp_inputs, + SequenceNumber earliest_mem_seqno = kMaxSequenceNumber); + +CompressionType GetCompressionType(const VersionStorageInfo* vstorage, + const MutableCFOptions& mutable_cf_options, + int level, int base_level, + const bool enable_compression = true); + +CompressionOptions GetCompressionOptions( + const MutableCFOptions& mutable_cf_options, + const VersionStorageInfo* vstorage, int level, + const bool enable_compression = true); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/compaction/compaction_picker_fifo.cc b/src/rocksdb/db/compaction/compaction_picker_fifo.cc new file mode 100644 index 000000000..1f875e3e1 --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_picker_fifo.cc @@ -0,0 +1,433 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
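// FIFOCompactionPicker implemented below deletes whole files (TTL- or
// size-based), optionally merges small L0 files, and can move aged files to
// warm storage. A minimal sketch of how any picker is typically driven
// (hypothetical caller code, not part of this file):
//
//   if (picker->NeedsCompaction(vstorage)) {
//     Compaction* c = picker->PickCompaction(cf_name, mutable_cf_options,
//                                            mutable_db_options, vstorage,
//                                            log_buffer);
//     // ... run the compaction job, then ...
//     picker->ReleaseCompactionFiles(c, job_status);
//   }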
+ +#include "db/compaction/compaction_picker_fifo.h" +#ifndef ROCKSDB_LITE + +#include <cinttypes> +#include <string> +#include <vector> + +#include "db/column_family.h" +#include "logging/log_buffer.h" +#include "logging/logging.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { +namespace { +uint64_t GetTotalFilesSize(const std::vector<FileMetaData*>& files) { + uint64_t total_size = 0; + for (const auto& f : files) { + total_size += f->fd.file_size; + } + return total_size; +} +} // anonymous namespace + +bool FIFOCompactionPicker::NeedsCompaction( + const VersionStorageInfo* vstorage) const { + const int kLevel0 = 0; + return vstorage->CompactionScore(kLevel0) >= 1; +} + +Compaction* FIFOCompactionPicker::PickTTLCompaction( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + LogBuffer* log_buffer) { + assert(mutable_cf_options.ttl > 0); + + const int kLevel0 = 0; + const std::vector<FileMetaData*>& level_files = vstorage->LevelFiles(kLevel0); + uint64_t total_size = GetTotalFilesSize(level_files); + + int64_t _current_time; + auto status = ioptions_.clock->GetCurrentTime(&_current_time); + if (!status.ok()) { + ROCKS_LOG_BUFFER(log_buffer, + "[%s] FIFO compaction: Couldn't get current time: %s. " + "Not doing compactions based on TTL. ", + cf_name.c_str(), status.ToString().c_str()); + return nullptr; + } + const uint64_t current_time = static_cast<uint64_t>(_current_time); + + if (!level0_compactions_in_progress_.empty()) { + ROCKS_LOG_BUFFER( + log_buffer, + "[%s] FIFO compaction: Already executing compaction. No need " + "to run parallel compactions since compactions are very fast", + cf_name.c_str()); + return nullptr; + } + + std::vector<CompactionInputFiles> inputs; + inputs.emplace_back(); + inputs[0].level = 0; + + // avoid underflow + if (current_time > mutable_cf_options.ttl) { + for (auto ritr = level_files.rbegin(); ritr != level_files.rend(); ++ritr) { + FileMetaData* f = *ritr; + assert(f); + if (f->fd.table_reader && f->fd.table_reader->GetTableProperties()) { + uint64_t creation_time = + f->fd.table_reader->GetTableProperties()->creation_time; + if (creation_time == 0 || + creation_time >= (current_time - mutable_cf_options.ttl)) { + break; + } + } + total_size -= f->fd.file_size; + inputs[0].files.push_back(f); + } + } + + // Return a nullptr and proceed to size-based FIFO compaction if: + // 1. there are no files older than ttl OR + // 2. there are a few files older than ttl, but deleting them will not bring + // the total size to be less than max_table_files_size threshold. 
+ if (inputs[0].files.empty() || + total_size > + mutable_cf_options.compaction_options_fifo.max_table_files_size) { + return nullptr; + } + + for (const auto& f : inputs[0].files) { + uint64_t creation_time = 0; + assert(f); + if (f->fd.table_reader && f->fd.table_reader->GetTableProperties()) { + creation_time = f->fd.table_reader->GetTableProperties()->creation_time; + } + ROCKS_LOG_BUFFER(log_buffer, + "[%s] FIFO compaction: picking file %" PRIu64 + " with creation time %" PRIu64 " for deletion", + cf_name.c_str(), f->fd.GetNumber(), creation_time); + } + + Compaction* c = new Compaction( + vstorage, ioptions_, mutable_cf_options, mutable_db_options, + std::move(inputs), 0, 0, 0, 0, kNoCompression, + mutable_cf_options.compression_opts, Temperature::kUnknown, + /* max_subcompactions */ 0, {}, /* is manual */ false, + /* trim_ts */ "", vstorage->CompactionScore(0), + /* is deletion compaction */ true, /* l0_files_might_overlap */ true, + CompactionReason::kFIFOTtl); + return c; +} + +// The size-based compaction picker for FIFO. +// +// When the entire column family size exceeds max_table_files_size, FIFO will +// try to delete the oldest sst file(s) until the resulting column family size +// is smaller than max_table_files_size. +// +// This function also takes care the case where a DB is migrating from level / +// universal compaction to FIFO compaction. During the migration, the column +// family will also have non-L0 files while FIFO can only create L0 files. +// In this case, this function will first purge the sst files in the bottom- +// most non-empty level first, and the DB will eventually converge to the +// regular FIFO case where there're only L0 files. Note that during the +// migration case, the purge order will only be an approximation of "FIFO" +// as entries inside lower-level files might sometimes be newer than some +// entries inside upper-level files. +Compaction* FIFOCompactionPicker::PickSizeCompaction( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + LogBuffer* log_buffer) { + // compute the total size and identify the last non-empty level + int last_level = 0; + uint64_t total_size = 0; + for (int level = 0; level < vstorage->num_levels(); ++level) { + auto level_size = GetTotalFilesSize(vstorage->LevelFiles(level)); + total_size += level_size; + if (level_size > 0) { + last_level = level; + } + } + const std::vector<FileMetaData*>& last_level_files = + vstorage->LevelFiles(last_level); + + if (last_level == 0 && + total_size <= + mutable_cf_options.compaction_options_fifo.max_table_files_size) { + // total size not exceeded, try to find intra level 0 compaction if enabled + const std::vector<FileMetaData*>& level0_files = vstorage->LevelFiles(0); + if (mutable_cf_options.compaction_options_fifo.allow_compaction && + level0_files.size() > 0) { + CompactionInputFiles comp_inputs; + // try to prevent same files from being compacted multiple times, which + // could produce large files that may never TTL-expire. Achieve this by + // disallowing compactions with files larger than memtable (inflate its + // size by 10% to account for uncompressed L0 files that may have size + // slightly greater than memtable size limit). 
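  // (For example, with a 64 MB write_buffer_size the cap computed below is
  // roughly 70 MB per deleted file.)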
+ size_t max_compact_bytes_per_del_file = + static_cast<size_t>(MultiplyCheckOverflow( + static_cast<uint64_t>(mutable_cf_options.write_buffer_size), + 1.1)); + if (FindIntraL0Compaction( + level0_files, + mutable_cf_options + .level0_file_num_compaction_trigger /* min_files_to_compact */ + , + max_compact_bytes_per_del_file, + mutable_cf_options.max_compaction_bytes, &comp_inputs)) { + Compaction* c = new Compaction( + vstorage, ioptions_, mutable_cf_options, mutable_db_options, + {comp_inputs}, 0, 16 * 1024 * 1024 /* output file size limit */, + 0 /* max compaction bytes, not applicable */, + 0 /* output path ID */, mutable_cf_options.compression, + mutable_cf_options.compression_opts, Temperature::kUnknown, + 0 /* max_subcompactions */, {}, /* is manual */ false, + /* trim_ts */ "", vstorage->CompactionScore(0), + /* is deletion compaction */ false, + /* l0_files_might_overlap */ true, + CompactionReason::kFIFOReduceNumFiles); + return c; + } + } + + ROCKS_LOG_BUFFER( + log_buffer, + "[%s] FIFO compaction: nothing to do. Total size %" PRIu64 + ", max size %" PRIu64 "\n", + cf_name.c_str(), total_size, + mutable_cf_options.compaction_options_fifo.max_table_files_size); + return nullptr; + } + + if (!level0_compactions_in_progress_.empty()) { + ROCKS_LOG_BUFFER( + log_buffer, + "[%s] FIFO compaction: Already executing compaction. No need " + "to run parallel compactions since compactions are very fast", + cf_name.c_str()); + return nullptr; + } + + std::vector<CompactionInputFiles> inputs; + inputs.emplace_back(); + inputs[0].level = last_level; + + if (last_level == 0) { + // In L0, right-most files are the oldest files. + for (auto ritr = last_level_files.rbegin(); ritr != last_level_files.rend(); + ++ritr) { + auto f = *ritr; + total_size -= f->fd.file_size; + inputs[0].files.push_back(f); + char tmp_fsize[16]; + AppendHumanBytes(f->fd.GetFileSize(), tmp_fsize, sizeof(tmp_fsize)); + ROCKS_LOG_BUFFER(log_buffer, + "[%s] FIFO compaction: picking file %" PRIu64 + " with size %s for deletion", + cf_name.c_str(), f->fd.GetNumber(), tmp_fsize); + if (total_size <= + mutable_cf_options.compaction_options_fifo.max_table_files_size) { + break; + } + } + } else { + // If the last level is non-L0, we actually don't know which file is + // logically the oldest since the file creation time only represents + // when this file was compacted to this level, which is independent + // to when the entries in this file were first inserted. + // + // As a result, we delete files from the left instead. This means the sst + // file with the smallest key will be deleted first. This design decision + // better serves a major type of FIFO use cases where smaller keys are + // associated with older data. 
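  // (Illustrative example: if the last non-empty level holds files spanning
  // [a..c], [d..f] and [g..i], ordered by smallest key, [a..c] is dropped
  // first, then [d..f], until the total size falls under max_table_files_size.)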
+ for (const auto& f : last_level_files) { + total_size -= f->fd.file_size; + inputs[0].files.push_back(f); + char tmp_fsize[16]; + AppendHumanBytes(f->fd.GetFileSize(), tmp_fsize, sizeof(tmp_fsize)); + ROCKS_LOG_BUFFER(log_buffer, + "[%s] FIFO compaction: picking file %" PRIu64 + " with size %s for deletion", + cf_name.c_str(), f->fd.GetNumber(), tmp_fsize); + if (total_size <= + mutable_cf_options.compaction_options_fifo.max_table_files_size) { + break; + } + } + } + + Compaction* c = new Compaction( + vstorage, ioptions_, mutable_cf_options, mutable_db_options, + std::move(inputs), last_level, + /* target_file_size */ 0, + /* max_compaction_bytes */ 0, + /* output_path_id */ 0, kNoCompression, + mutable_cf_options.compression_opts, Temperature::kUnknown, + /* max_subcompactions */ 0, {}, /* is manual */ false, + /* trim_ts */ "", vstorage->CompactionScore(0), + /* is deletion compaction */ true, + /* l0_files_might_overlap */ true, CompactionReason::kFIFOMaxSize); + return c; +} + +Compaction* FIFOCompactionPicker::PickCompactionToWarm( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + LogBuffer* log_buffer) { + if (mutable_cf_options.compaction_options_fifo.age_for_warm == 0) { + return nullptr; + } + + // PickCompactionToWarm is only triggered if there is no non-L0 files. + for (int level = 1; level < vstorage->num_levels(); ++level) { + if (GetTotalFilesSize(vstorage->LevelFiles(level)) > 0) { + return nullptr; + } + } + + const int kLevel0 = 0; + const std::vector<FileMetaData*>& level_files = vstorage->LevelFiles(kLevel0); + + int64_t _current_time; + auto status = ioptions_.clock->GetCurrentTime(&_current_time); + if (!status.ok()) { + ROCKS_LOG_BUFFER(log_buffer, + "[%s] FIFO compaction: Couldn't get current time: %s. " + "Not doing compactions based on warm threshold. ", + cf_name.c_str(), status.ToString().c_str()); + return nullptr; + } + const uint64_t current_time = static_cast<uint64_t>(_current_time); + + if (!level0_compactions_in_progress_.empty()) { + ROCKS_LOG_BUFFER( + log_buffer, + "[%s] FIFO compaction: Already executing compaction. Parallel " + "compactions are not supported", + cf_name.c_str()); + return nullptr; + } + + std::vector<CompactionInputFiles> inputs; + inputs.emplace_back(); + inputs[0].level = 0; + + // avoid underflow + if (current_time > mutable_cf_options.compaction_options_fifo.age_for_warm) { + uint64_t create_time_threshold = + current_time - mutable_cf_options.compaction_options_fifo.age_for_warm; + uint64_t compaction_size = 0; + // We will ideally identify a file qualifying for warm tier by knowing + // the timestamp for the youngest entry in the file. However, right now + // we don't have the information. We infer it by looking at timestamp + // of the next file's (which is just younger) oldest entry's timestamp. + FileMetaData* prev_file = nullptr; + for (auto ritr = level_files.rbegin(); ritr != level_files.rend(); ++ritr) { + FileMetaData* f = *ritr; + assert(f); + if (f->being_compacted) { + // Right now this probably won't happen as we never try to schedule + // two compactions in parallel, so here we just simply don't schedule + // anything. + return nullptr; + } + uint64_t oldest_ancester_time = f->TryGetOldestAncesterTime(); + if (oldest_ancester_time == kUnknownOldestAncesterTime) { + // Older files might not have enough information. 
It is possible to + // handle these files by looking at newer files, but maintaining the + // logic isn't worth it. + break; + } + if (oldest_ancester_time > create_time_threshold) { + // The previous file (which has slightly older data) doesn't qualify + // for warm tier. + break; + } + if (prev_file != nullptr) { + compaction_size += prev_file->fd.GetFileSize(); + if (compaction_size > mutable_cf_options.max_compaction_bytes) { + break; + } + inputs[0].files.push_back(prev_file); + ROCKS_LOG_BUFFER(log_buffer, + "[%s] FIFO compaction: picking file %" PRIu64 + " with next file's oldest time %" PRIu64 " for warm", + cf_name.c_str(), prev_file->fd.GetNumber(), + oldest_ancester_time); + } + if (f->temperature == Temperature::kUnknown || + f->temperature == Temperature::kHot) { + prev_file = f; + } else if (!inputs[0].files.empty()) { + // A warm file newer than files picked. + break; + } else { + assert(prev_file == nullptr); + } + } + } + + if (inputs[0].files.empty()) { + return nullptr; + } + + Compaction* c = new Compaction( + vstorage, ioptions_, mutable_cf_options, mutable_db_options, + std::move(inputs), 0, 0 /* output file size limit */, + 0 /* max compaction bytes, not applicable */, 0 /* output path ID */, + mutable_cf_options.compression, mutable_cf_options.compression_opts, + Temperature::kWarm, + /* max_subcompactions */ 0, {}, /* is manual */ false, /* trim_ts */ "", + vstorage->CompactionScore(0), + /* is deletion compaction */ false, /* l0_files_might_overlap */ true, + CompactionReason::kChangeTemperature); + return c; +} + +Compaction* FIFOCompactionPicker::PickCompaction( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + LogBuffer* log_buffer, SequenceNumber /*earliest_memtable_seqno*/) { + Compaction* c = nullptr; + if (mutable_cf_options.ttl > 0) { + c = PickTTLCompaction(cf_name, mutable_cf_options, mutable_db_options, + vstorage, log_buffer); + } + if (c == nullptr) { + c = PickSizeCompaction(cf_name, mutable_cf_options, mutable_db_options, + vstorage, log_buffer); + } + if (c == nullptr) { + c = PickCompactionToWarm(cf_name, mutable_cf_options, mutable_db_options, + vstorage, log_buffer); + } + RegisterCompaction(c); + return c; +} + +Compaction* FIFOCompactionPicker::CompactRange( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + int input_level, int output_level, + const CompactRangeOptions& /*compact_range_options*/, + const InternalKey* /*begin*/, const InternalKey* /*end*/, + InternalKey** compaction_end, bool* /*manual_conflict*/, + uint64_t /*max_file_num_to_ignore*/, const std::string& /*trim_ts*/) { +#ifdef NDEBUG + (void)input_level; + (void)output_level; +#endif + assert(input_level == 0); + assert(output_level == 0); + *compaction_end = nullptr; + LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, ioptions_.logger); + Compaction* c = PickCompaction(cf_name, mutable_cf_options, + mutable_db_options, vstorage, &log_buffer); + log_buffer.FlushBufferToLog(); + return c; +} + +} // namespace ROCKSDB_NAMESPACE +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/db/compaction/compaction_picker_fifo.h b/src/rocksdb/db/compaction/compaction_picker_fifo.h new file mode 100644 index 000000000..544259f38 --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_picker_fifo.h @@ -0,0 +1,63 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#ifndef ROCKSDB_LITE + +#include "db/compaction/compaction_picker.h" + +namespace ROCKSDB_NAMESPACE { +class FIFOCompactionPicker : public CompactionPicker { + public: + FIFOCompactionPicker(const ImmutableOptions& ioptions, + const InternalKeyComparator* icmp) + : CompactionPicker(ioptions, icmp) {} + + virtual Compaction* PickCompaction( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* version, + LogBuffer* log_buffer, + SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override; + + virtual Compaction* CompactRange( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + int input_level, int output_level, + const CompactRangeOptions& compact_range_options, + const InternalKey* begin, const InternalKey* end, + InternalKey** compaction_end, bool* manual_conflict, + uint64_t max_file_num_to_ignore, const std::string& trim_ts) override; + + // The maximum allowed output level. Always returns 0. + virtual int MaxOutputLevel() const override { return 0; } + + virtual bool NeedsCompaction( + const VersionStorageInfo* vstorage) const override; + + private: + Compaction* PickTTLCompaction(const std::string& cf_name, + const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, + VersionStorageInfo* version, + LogBuffer* log_buffer); + + Compaction* PickSizeCompaction(const std::string& cf_name, + const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, + VersionStorageInfo* version, + LogBuffer* log_buffer); + + Compaction* PickCompactionToWarm(const std::string& cf_name, + const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, + VersionStorageInfo* version, + LogBuffer* log_buffer); +}; +} // namespace ROCKSDB_NAMESPACE +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/db/compaction/compaction_picker_level.cc b/src/rocksdb/db/compaction/compaction_picker_level.cc new file mode 100644 index 000000000..b689b6add --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_picker_level.cc @@ -0,0 +1,841 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+
+#include "db/compaction/compaction_picker_level.h"
+
+#include <string>
+
+#include <utility>
+
+#include <vector>
+
+#include "db/version_edit.h"
+#include "logging/log_buffer.h"
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool LevelCompactionPicker::NeedsCompaction(
+    const VersionStorageInfo* vstorage) const {
+  if (!vstorage->ExpiredTtlFiles().empty()) {
+    return true;
+  }
+  if (!vstorage->FilesMarkedForPeriodicCompaction().empty()) {
+    return true;
+  }
+  if (!vstorage->BottommostFilesMarkedForCompaction().empty()) {
+    return true;
+  }
+  if (!vstorage->FilesMarkedForCompaction().empty()) {
+    return true;
+  }
+  if (!vstorage->FilesMarkedForForcedBlobGC().empty()) {
+    return true;
+  }
+  for (int i = 0; i <= vstorage->MaxInputLevel(); i++) {
+    if (vstorage->CompactionScore(i) >= 1) {
+      return true;
+    }
+  }
+  return false;
+}
+
+namespace {
+// A class to build a leveled compaction step-by-step.
+class LevelCompactionBuilder {
+ public:
+  LevelCompactionBuilder(const std::string& cf_name,
+                         VersionStorageInfo* vstorage,
+                         SequenceNumber earliest_mem_seqno,
+                         CompactionPicker* compaction_picker,
+                         LogBuffer* log_buffer,
+                         const MutableCFOptions& mutable_cf_options,
+                         const ImmutableOptions& ioptions,
+                         const MutableDBOptions& mutable_db_options)
+      : cf_name_(cf_name),
+        vstorage_(vstorage),
+        earliest_mem_seqno_(earliest_mem_seqno),
+        compaction_picker_(compaction_picker),
+        log_buffer_(log_buffer),
+        mutable_cf_options_(mutable_cf_options),
+        ioptions_(ioptions),
+        mutable_db_options_(mutable_db_options) {}
+
+  // Pick and return a compaction.
+  Compaction* PickCompaction();
+
+  // Pick the initial files to compact to the next level (or together
+  // in an intra-L0 compaction).
+  void SetupInitialFiles();
+
+  // If the initial files are from L0 level, pick other L0
+  // files if needed.
+  bool SetupOtherL0FilesIfNeeded();
+
+  // Compaction with round-robin compaction priority allows more files to be
+  // picked to form a large compaction.
+  void SetupOtherFilesWithRoundRobinExpansion();
+  // Based on the initial files, set up the other files that need to be
+  // compacted in this compaction accordingly.
+  bool SetupOtherInputsIfNeeded();
+
+  Compaction* GetCompaction();
+
+  // For the specified level, pick a file that we want to compact.
+  // Returns false if there is no file to compact.
+  // If it returns true, inputs->files.size() will be exactly one for
+  // all compaction priorities except round-robin. For round-robin,
+  // multiple consecutive files may be put into inputs->files.
+  // If level is 0 and there is already a compaction on that level, this
+  // function will return false.
+  bool PickFileToCompact();
+
+  // Return true if an L0 trivial move is picked up.
+  bool TryPickL0TrivialMove();
+
+  // For L0->L0, picks the longest span of files that aren't currently
+  // undergoing compaction for which work-per-deleted-file decreases. The span
+  // always starts from the newest L0 file.
+  //
+  // Intra-L0 compaction is independent of all other files, so it can be
+  // performed even when L0->base_level compactions are blocked.
+  //
+  // Returns true if `inputs` is populated with a span of files to be compacted;
+  // otherwise, returns false.
+  bool PickIntraL0Compaction();
+
+  // Return true if TrivialMove is extended. `start_index` is the index of
+  // the initial file picked, which should already be in `start_level_inputs_`.
+  bool TryExtendNonL0TrivialMove(int start_index);
+
+  // Picks a file from level_files to compact.
+ // level_files is a vector of (level, file metadata) in ascending order of + // level. If compact_to_next_level is true, compact the file to the next + // level, otherwise, compact to the same level as the input file. + void PickFileToCompact( + const autovector<std::pair<int, FileMetaData*>>& level_files, + bool compact_to_next_level); + + const std::string& cf_name_; + VersionStorageInfo* vstorage_; + SequenceNumber earliest_mem_seqno_; + CompactionPicker* compaction_picker_; + LogBuffer* log_buffer_; + int start_level_ = -1; + int output_level_ = -1; + int parent_index_ = -1; + int base_index_ = -1; + double start_level_score_ = 0; + bool is_manual_ = false; + bool is_l0_trivial_move_ = false; + CompactionInputFiles start_level_inputs_; + std::vector<CompactionInputFiles> compaction_inputs_; + CompactionInputFiles output_level_inputs_; + std::vector<FileMetaData*> grandparents_; + CompactionReason compaction_reason_ = CompactionReason::kUnknown; + + const MutableCFOptions& mutable_cf_options_; + const ImmutableOptions& ioptions_; + const MutableDBOptions& mutable_db_options_; + // Pick a path ID to place a newly generated file, with its level + static uint32_t GetPathId(const ImmutableCFOptions& ioptions, + const MutableCFOptions& mutable_cf_options, + int level); + + static const int kMinFilesForIntraL0Compaction = 4; +}; + +void LevelCompactionBuilder::PickFileToCompact( + const autovector<std::pair<int, FileMetaData*>>& level_files, + bool compact_to_next_level) { + for (auto& level_file : level_files) { + // If it's being compacted it has nothing to do here. + // If this assert() fails that means that some function marked some + // files as being_compacted, but didn't call ComputeCompactionScore() + assert(!level_file.second->being_compacted); + start_level_ = level_file.first; + if ((compact_to_next_level && + start_level_ == vstorage_->num_non_empty_levels() - 1) || + (start_level_ == 0 && + !compaction_picker_->level0_compactions_in_progress()->empty())) { + continue; + } + if (compact_to_next_level) { + output_level_ = + (start_level_ == 0) ? vstorage_->base_level() : start_level_ + 1; + } else { + output_level_ = start_level_; + } + start_level_inputs_.files = {level_file.second}; + start_level_inputs_.level = start_level_; + if (compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, + &start_level_inputs_)) { + return; + } + } + start_level_inputs_.files.clear(); +} + +void LevelCompactionBuilder::SetupInitialFiles() { + // Find the compactions by size on all levels. + bool skipped_l0_to_base = false; + for (int i = 0; i < compaction_picker_->NumberLevels() - 1; i++) { + start_level_score_ = vstorage_->CompactionScore(i); + start_level_ = vstorage_->CompactionScoreLevel(i); + assert(i == 0 || start_level_score_ <= vstorage_->CompactionScore(i - 1)); + if (start_level_score_ >= 1) { + if (skipped_l0_to_base && start_level_ == vstorage_->base_level()) { + // If L0->base_level compaction is pending, don't schedule further + // compaction from base level. Otherwise L0->base_level compaction + // may starve. + continue; + } + output_level_ = + (start_level_ == 0) ? vstorage_->base_level() : start_level_ + 1; + if (PickFileToCompact()) { + // found the compaction! 
+ if (start_level_ == 0) { + // L0 score = `num L0 files` / `level0_file_num_compaction_trigger` + compaction_reason_ = CompactionReason::kLevelL0FilesNum; + } else { + // L1+ score = `Level files size` / `MaxBytesForLevel` + compaction_reason_ = CompactionReason::kLevelMaxLevelSize; + } + break; + } else { + // didn't find the compaction, clear the inputs + start_level_inputs_.clear(); + if (start_level_ == 0) { + skipped_l0_to_base = true; + // L0->base_level may be blocked due to ongoing L0->base_level + // compactions. It may also be blocked by an ongoing compaction from + // base_level downwards. + // + // In these cases, to reduce L0 file count and thus reduce likelihood + // of write stalls, we can attempt compacting a span of files within + // L0. + if (PickIntraL0Compaction()) { + output_level_ = 0; + compaction_reason_ = CompactionReason::kLevelL0FilesNum; + break; + } + } + } + } else { + // Compaction scores are sorted in descending order, no further scores + // will be >= 1. + break; + } + } + if (!start_level_inputs_.empty()) { + return; + } + + // if we didn't find a compaction, check if there are any files marked for + // compaction + parent_index_ = base_index_ = -1; + + compaction_picker_->PickFilesMarkedForCompaction( + cf_name_, vstorage_, &start_level_, &output_level_, &start_level_inputs_); + if (!start_level_inputs_.empty()) { + compaction_reason_ = CompactionReason::kFilesMarkedForCompaction; + return; + } + + // Bottommost Files Compaction on deleting tombstones + PickFileToCompact(vstorage_->BottommostFilesMarkedForCompaction(), false); + if (!start_level_inputs_.empty()) { + compaction_reason_ = CompactionReason::kBottommostFiles; + return; + } + + // TTL Compaction + if (ioptions_.compaction_pri == kRoundRobin && + !vstorage_->ExpiredTtlFiles().empty()) { + auto expired_files = vstorage_->ExpiredTtlFiles(); + // the expired files list should already be sorted by level + start_level_ = expired_files.front().first; +#ifndef NDEBUG + for (const auto& file : expired_files) { + assert(start_level_ <= file.first); + } +#endif + if (start_level_ > 0) { + output_level_ = start_level_ + 1; + if (PickFileToCompact()) { + compaction_reason_ = CompactionReason::kRoundRobinTtl; + return; + } + } + } + + PickFileToCompact(vstorage_->ExpiredTtlFiles(), true); + if (!start_level_inputs_.empty()) { + compaction_reason_ = CompactionReason::kTtl; + return; + } + + // Periodic Compaction + PickFileToCompact(vstorage_->FilesMarkedForPeriodicCompaction(), false); + if (!start_level_inputs_.empty()) { + compaction_reason_ = CompactionReason::kPeriodicCompaction; + return; + } + + // Forced blob garbage collection + PickFileToCompact(vstorage_->FilesMarkedForForcedBlobGC(), false); + if (!start_level_inputs_.empty()) { + compaction_reason_ = CompactionReason::kForcedBlobGC; + return; + } +} + +bool LevelCompactionBuilder::SetupOtherL0FilesIfNeeded() { + if (start_level_ == 0 && output_level_ != 0 && !is_l0_trivial_move_) { + return compaction_picker_->GetOverlappingL0Files( + vstorage_, &start_level_inputs_, output_level_, &parent_index_); + } + return true; +} + +void LevelCompactionBuilder::SetupOtherFilesWithRoundRobinExpansion() { + // We only expand when the start level is not L0 under round robin + assert(start_level_ >= 1); + + // For round-robin compaction priority, we have 3 constraints when picking + // multiple files. 
+ // Constraint 1: We can only pick consecutive files + // -> Constraint 1a: When a file is being compacted (or some input files + // are being compacted after expanding, we cannot + // choose it and have to stop choosing more files + // -> Constraint 1b: When we reach the last file (with largest keys), we + // cannot choose more files (the next file will be the + // first one) + // Constraint 2: We should ensure the total compaction bytes (including the + // overlapped files from the next level) is no more than + // mutable_cf_options_.max_compaction_bytes + // Constraint 3: We try our best to pick as many files as possible so that + // the post-compaction level size is less than + // MaxBytesForLevel(start_level_) + // Constraint 4: We do not expand if it is possible to apply a trivial move + // Constraint 5 (TODO): Try to pick minimal files to split into the target + // number of subcompactions + TEST_SYNC_POINT("LevelCompactionPicker::RoundRobin"); + + // Only expand the inputs when we have selected a file in start_level_inputs_ + if (start_level_inputs_.size() == 0) return; + + uint64_t start_lvl_bytes_no_compacting = 0; + uint64_t curr_bytes_to_compact = 0; + uint64_t start_lvl_max_bytes_to_compact = 0; + const std::vector<FileMetaData*>& level_files = + vstorage_->LevelFiles(start_level_); + // Constraint 3 (pre-calculate the ideal max bytes to compact) + for (auto f : level_files) { + if (!f->being_compacted) { + start_lvl_bytes_no_compacting += f->fd.GetFileSize(); + } + } + if (start_lvl_bytes_no_compacting > + vstorage_->MaxBytesForLevel(start_level_)) { + start_lvl_max_bytes_to_compact = start_lvl_bytes_no_compacting - + vstorage_->MaxBytesForLevel(start_level_); + } + + size_t start_index = vstorage_->FilesByCompactionPri(start_level_)[0]; + InternalKey smallest, largest; + // Constraint 4 (No need to check again later) + compaction_picker_->GetRange(start_level_inputs_, &smallest, &largest); + CompactionInputFiles output_level_inputs; + output_level_inputs.level = output_level_; + vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest, + &output_level_inputs.files); + if (output_level_inputs.empty()) { + if (TryExtendNonL0TrivialMove((int)start_index)) { + return; + } + } + // Constraint 3 + if (start_level_inputs_[0]->fd.GetFileSize() >= + start_lvl_max_bytes_to_compact) { + return; + } + CompactionInputFiles tmp_start_level_inputs; + tmp_start_level_inputs = start_level_inputs_; + // TODO (zichen): Future parallel round-robin may also need to update this + // Constraint 1b (only expand till the end) + for (size_t i = start_index + 1; i < level_files.size(); i++) { + auto* f = level_files[i]; + if (f->being_compacted) { + // Constraint 1a + return; + } + + tmp_start_level_inputs.files.push_back(f); + if (!compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, + &tmp_start_level_inputs) || + compaction_picker_->FilesRangeOverlapWithCompaction( + {tmp_start_level_inputs}, output_level_, + Compaction::EvaluatePenultimateLevel( + vstorage_, ioptions_, start_level_, output_level_))) { + // Constraint 1a + tmp_start_level_inputs.clear(); + return; + } + + curr_bytes_to_compact = 0; + for (auto start_lvl_f : tmp_start_level_inputs.files) { + curr_bytes_to_compact += start_lvl_f->fd.GetFileSize(); + } + + // Check whether any output level files are locked + compaction_picker_->GetRange(tmp_start_level_inputs, &smallest, &largest); + vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest, + &output_level_inputs.files); + if 
(!output_level_inputs.empty() && + !compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, + &output_level_inputs)) { + // Constraint 1a + tmp_start_level_inputs.clear(); + return; + } + + uint64_t start_lvl_curr_bytes_to_compact = curr_bytes_to_compact; + for (auto output_lvl_f : output_level_inputs.files) { + curr_bytes_to_compact += output_lvl_f->fd.GetFileSize(); + } + if (curr_bytes_to_compact > mutable_cf_options_.max_compaction_bytes) { + // Constraint 2 + tmp_start_level_inputs.clear(); + return; + } + + start_level_inputs_.files = tmp_start_level_inputs.files; + // Constraint 3 + if (start_lvl_curr_bytes_to_compact > start_lvl_max_bytes_to_compact) { + return; + } + } +} + +bool LevelCompactionBuilder::SetupOtherInputsIfNeeded() { + // Setup input files from output level. For output to L0, we only compact + // spans of files that do not interact with any pending compactions, so don't + // need to consider other levels. + if (output_level_ != 0) { + output_level_inputs_.level = output_level_; + bool round_robin_expanding = + ioptions_.compaction_pri == kRoundRobin && + compaction_reason_ == CompactionReason::kLevelMaxLevelSize; + if (round_robin_expanding) { + SetupOtherFilesWithRoundRobinExpansion(); + } + if (!is_l0_trivial_move_ && + !compaction_picker_->SetupOtherInputs( + cf_name_, mutable_cf_options_, vstorage_, &start_level_inputs_, + &output_level_inputs_, &parent_index_, base_index_, + round_robin_expanding)) { + return false; + } + + compaction_inputs_.push_back(start_level_inputs_); + if (!output_level_inputs_.empty()) { + compaction_inputs_.push_back(output_level_inputs_); + } + + if (!is_l0_trivial_move_) { + // In some edge cases we could pick a compaction that will be compacting + // a key range that overlap with another running compaction, and both + // of them have the same output level. This could happen if + // (1) we are running a non-exclusive manual compaction + // (2) AddFile ingest a new file into the LSM tree + // We need to disallow this from happening. + if (compaction_picker_->FilesRangeOverlapWithCompaction( + compaction_inputs_, output_level_, + Compaction::EvaluatePenultimateLevel( + vstorage_, ioptions_, start_level_, output_level_))) { + // This compaction output could potentially conflict with the output + // of a currently running compaction, we cannot run it. + return false; + } + compaction_picker_->GetGrandparents(vstorage_, start_level_inputs_, + output_level_inputs_, &grandparents_); + } + } else { + compaction_inputs_.push_back(start_level_inputs_); + } + return true; +} + +Compaction* LevelCompactionBuilder::PickCompaction() { + // Pick up the first file to start compaction. It may have been extended + // to a clean cut. + SetupInitialFiles(); + if (start_level_inputs_.empty()) { + return nullptr; + } + assert(start_level_ >= 0 && output_level_ >= 0); + + // If it is a L0 -> base level compaction, we need to set up other L0 + // files if needed. + if (!SetupOtherL0FilesIfNeeded()) { + return nullptr; + } + + // Pick files in the output level and expand more files in the start level + // if needed. + if (!SetupOtherInputsIfNeeded()) { + return nullptr; + } + + // Form a compaction object containing the files we picked. 
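+  // GetCompaction() also registers the new compaction with the picker and
+  // recomputes the compaction scores, so no extra bookkeeping is needed here.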
+ Compaction* c = GetCompaction(); + + TEST_SYNC_POINT_CALLBACK("LevelCompactionPicker::PickCompaction:Return", c); + + return c; +} + +Compaction* LevelCompactionBuilder::GetCompaction() { + auto c = new Compaction( + vstorage_, ioptions_, mutable_cf_options_, mutable_db_options_, + std::move(compaction_inputs_), output_level_, + MaxFileSizeForLevel(mutable_cf_options_, output_level_, + ioptions_.compaction_style, vstorage_->base_level(), + ioptions_.level_compaction_dynamic_level_bytes), + mutable_cf_options_.max_compaction_bytes, + GetPathId(ioptions_, mutable_cf_options_, output_level_), + GetCompressionType(vstorage_, mutable_cf_options_, output_level_, + vstorage_->base_level()), + GetCompressionOptions(mutable_cf_options_, vstorage_, output_level_), + Temperature::kUnknown, + /* max_subcompactions */ 0, std::move(grandparents_), is_manual_, + /* trim_ts */ "", start_level_score_, false /* deletion_compaction */, + /* l0_files_might_overlap */ start_level_ == 0 && !is_l0_trivial_move_, + compaction_reason_); + + // If it's level 0 compaction, make sure we don't execute any other level 0 + // compactions in parallel + compaction_picker_->RegisterCompaction(c); + + // Creating a compaction influences the compaction score because the score + // takes running compactions into account (by skipping files that are already + // being compacted). Since we just changed compaction score, we recalculate it + // here + vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_); + return c; +} + +/* + * Find the optimal path to place a file + * Given a level, finds the path where levels up to it will fit in levels + * up to and including this path + */ +uint32_t LevelCompactionBuilder::GetPathId( + const ImmutableCFOptions& ioptions, + const MutableCFOptions& mutable_cf_options, int level) { + uint32_t p = 0; + assert(!ioptions.cf_paths.empty()); + + // size remaining in the most recent path + uint64_t current_path_size = ioptions.cf_paths[0].target_size; + + uint64_t level_size; + int cur_level = 0; + + // max_bytes_for_level_base denotes L1 size. + // We estimate L0 size to be the same as L1. + level_size = mutable_cf_options.max_bytes_for_level_base; + + // Last path is the fallback + while (p < ioptions.cf_paths.size() - 1) { + if (level_size <= current_path_size) { + if (cur_level == level) { + // Does desired level fit in this path? + return p; + } else { + current_path_size -= level_size; + if (cur_level > 0) { + if (ioptions.level_compaction_dynamic_level_bytes) { + // Currently, level_compaction_dynamic_level_bytes is ignored when + // multiple db paths are specified. https://github.com/facebook/ + // rocksdb/blob/main/db/column_family.cc. + // Still, adding this check to avoid accidentally using + // max_bytes_for_level_multiplier_additional + level_size = static_cast<uint64_t>( + level_size * mutable_cf_options.max_bytes_for_level_multiplier); + } else { + level_size = static_cast<uint64_t>( + level_size * mutable_cf_options.max_bytes_for_level_multiplier * + mutable_cf_options.MaxBytesMultiplerAdditional(cur_level)); + } + } + cur_level++; + continue; + } + } + p++; + current_path_size = ioptions.cf_paths[p].target_size; + } + return p; +} + +bool LevelCompactionBuilder::TryPickL0TrivialMove() { + if (vstorage_->base_level() <= 0) { + return false; + } + if (start_level_ == 0 && mutable_cf_options_.compression_per_level.empty() && + !vstorage_->LevelFiles(output_level_).empty() && + ioptions_.db_paths.size() <= 1) { + // Try to pick trivial move from L0 to L1. 
We start from the oldest
+    // file. We keep expanding to newer files if it would form a
+    // trivial move.
+    // For now we don't support it with
+    // mutable_cf_options_.compression_per_level, to avoid the extra logic of
+    // determining whether an L0 file can be trivially moved to the next
+    // level.
+    // We skip the case where output level is empty, since in this case, at
+    // least the oldest file would qualify for trivial move, and this would
+    // be a surprising behavior with few benefits.
+
+    // We search from the oldest file to the newest. In theory, there are
+    // files in the middle that can form trivial moves too, but it is probably
+    // uncommon and we ignore these cases for simplicity.
+    const std::vector<FileMetaData*>& level_files =
+        vstorage_->LevelFiles(start_level_);
+
+    InternalKey my_smallest, my_largest;
+    for (auto it = level_files.rbegin(); it != level_files.rend(); ++it) {
+      CompactionInputFiles output_level_inputs;
+      output_level_inputs.level = output_level_;
+      FileMetaData* file = *it;
+      if (it == level_files.rbegin()) {
+        my_smallest = file->smallest;
+        my_largest = file->largest;
+      } else {
+        if (compaction_picker_->icmp()->Compare(file->largest, my_smallest) <
+            0) {
+          my_smallest = file->smallest;
+        } else if (compaction_picker_->icmp()->Compare(file->smallest,
+                                                       my_largest) > 0) {
+          my_largest = file->largest;
+        } else {
+          break;
+        }
+      }
+      vstorage_->GetOverlappingInputs(output_level_, &my_smallest, &my_largest,
+                                      &output_level_inputs.files);
+      if (output_level_inputs.empty()) {
+        assert(!file->being_compacted);
+        start_level_inputs_.files.push_back(file);
+      } else {
+        break;
+      }
+    }
+  }
+
+  if (!start_level_inputs_.empty()) {
+    // Sort files by key range. Not sure it's 100% necessary but it's cleaner
+    // to always keep files sorted by key when the key ranges don't overlap.
+    std::sort(start_level_inputs_.files.begin(),
+              start_level_inputs_.files.end(),
+              [icmp = compaction_picker_->icmp()](FileMetaData* f1,
+                                                  FileMetaData* f2) -> bool {
+                return (icmp->Compare(f1->smallest, f2->smallest) < 0);
+              });
+
+    is_l0_trivial_move_ = true;
+    return true;
+  }
+  return false;
+}
+
+bool LevelCompactionBuilder::TryExtendNonL0TrivialMove(int start_index) {
+  if (start_level_inputs_.size() == 1 &&
+      (ioptions_.db_paths.empty() || ioptions_.db_paths.size() == 1) &&
+      (mutable_cf_options_.compression_per_level.empty())) {
+    // Only the file at `start_index` has been picked so far, and it is likely
+    // a trivial move. Try to expand if it is still a trivial move, but not
+    // beyond max_compaction_bytes or 4 files, so that we don't create too
+    // much compaction pressure for the next level.
+    // Ignore if there is more than one DB path, as it would be hard
+    // to predict whether it is a trivial move.
+    const std::vector<FileMetaData*>& level_files =
+        vstorage_->LevelFiles(start_level_);
+    const size_t kMaxMultiTrivialMove = 4;
+    FileMetaData* initial_file = start_level_inputs_.files[0];
+    size_t total_size = initial_file->fd.GetFileSize();
+    CompactionInputFiles output_level_inputs;
+    output_level_inputs.level = output_level_;
+    for (int i = start_index + 1;
+         i < static_cast<int>(level_files.size()) &&
+             start_level_inputs_.size() < kMaxMultiTrivialMove;
+         i++) {
+      FileMetaData* next_file = level_files[i];
+      if (next_file->being_compacted) {
+        break;
+      }
+      vstorage_->GetOverlappingInputs(output_level_, &(initial_file->smallest),
+                                      &(next_file->largest),
+                                      &output_level_inputs.files);
+      if (!output_level_inputs.empty()) {
+        break;
+      }
+      if (i < static_cast<int>(level_files.size()) - 1 &&
+          compaction_picker_->icmp()
+                  ->user_comparator()
+                  ->CompareWithoutTimestamp(
+                      next_file->largest.user_key(),
+                      level_files[i + 1]->smallest.user_key()) == 0) {
+        TEST_SYNC_POINT_CALLBACK(
+            "LevelCompactionBuilder::TryExtendNonL0TrivialMove:NoCleanCut",
+            nullptr);
+        // Not a clean cut after adding the next file. Skip.
+        break;
+      }
+      total_size += next_file->fd.GetFileSize();
+      if (total_size > mutable_cf_options_.max_compaction_bytes) {
+        break;
+      }
+      start_level_inputs_.files.push_back(next_file);
+    }
+    return start_level_inputs_.size() > 1;
+  }
+  return false;
+}
+
+bool LevelCompactionBuilder::PickFileToCompact() {
+  // level 0 files are overlapping. So we cannot pick more
+  // than one concurrent compaction at this level. This
+  // could be made better by looking at key-ranges that are
+  // being compacted at level 0.
+  if (start_level_ == 0 &&
+      !compaction_picker_->level0_compactions_in_progress()->empty()) {
+    TEST_SYNC_POINT("LevelCompactionPicker::PickCompactionBySize:0");
+    return false;
+  }
+
+  start_level_inputs_.clear();
+  start_level_inputs_.level = start_level_;
+
+  assert(start_level_ >= 0);
+
+  if (TryPickL0TrivialMove()) {
+    return true;
+  }
+
+  const std::vector<FileMetaData*>& level_files =
+      vstorage_->LevelFiles(start_level_);
+
+  // Pick the file with the highest score in this level that is not already
+  // being compacted.
+  const std::vector<int>& file_scores =
+      vstorage_->FilesByCompactionPri(start_level_);
+
+  unsigned int cmp_idx;
+  for (cmp_idx = vstorage_->NextCompactionIndex(start_level_);
+       cmp_idx < file_scores.size(); cmp_idx++) {
+    int index = file_scores[cmp_idx];
+    auto* f = level_files[index];
+
+    // do not pick a file to compact if it is being compacted
+    // from the n-1 level.
+    if (f->being_compacted) {
+      if (ioptions_.compaction_pri == kRoundRobin) {
+        // TODO(zichen): this file may be involved in one compaction from
+        // an upper level, cannot advance the cursor for round-robin policy.
+        // Currently, we do not pick any file to compact in this case. We
+        // should fix this later to ensure a compaction is picked but the
+        // cursor shall not be advanced.
+        return false;
+      }
+      continue;
+    }
+
+    start_level_inputs_.files.push_back(f);
+    if (!compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+                                                    &start_level_inputs_) ||
+        compaction_picker_->FilesRangeOverlapWithCompaction(
+            {start_level_inputs_}, output_level_,
+            Compaction::EvaluatePenultimateLevel(
+                vstorage_, ioptions_, start_level_, output_level_))) {
+      // A locked (pending compaction) input-level file was pulled in due to
+      // user-key overlap.
+ start_level_inputs_.clear(); + + if (ioptions_.compaction_pri == kRoundRobin) { + return false; + } + continue; + } + + // Now that input level is fully expanded, we check whether any output + // files are locked due to pending compaction. + // + // Note we rely on ExpandInputsToCleanCut() to tell us whether any output- + // level files are locked, not just the extra ones pulled in for user-key + // overlap. + InternalKey smallest, largest; + compaction_picker_->GetRange(start_level_inputs_, &smallest, &largest); + CompactionInputFiles output_level_inputs; + output_level_inputs.level = output_level_; + vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest, + &output_level_inputs.files); + if (output_level_inputs.empty()) { + if (TryExtendNonL0TrivialMove(index)) { + break; + } + } else { + if (!compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, + &output_level_inputs)) { + start_level_inputs_.clear(); + if (ioptions_.compaction_pri == kRoundRobin) { + return false; + } + continue; + } + } + + base_index_ = index; + break; + } + + // store where to start the iteration in the next call to PickCompaction + if (ioptions_.compaction_pri != kRoundRobin) { + vstorage_->SetNextCompactionIndex(start_level_, cmp_idx); + } + return start_level_inputs_.size() > 0; +} + +bool LevelCompactionBuilder::PickIntraL0Compaction() { + start_level_inputs_.clear(); + const std::vector<FileMetaData*>& level_files = + vstorage_->LevelFiles(0 /* level */); + if (level_files.size() < + static_cast<size_t>( + mutable_cf_options_.level0_file_num_compaction_trigger + 2) || + level_files[0]->being_compacted) { + // If L0 isn't accumulating much files beyond the regular trigger, don't + // resort to L0->L0 compaction yet. + return false; + } + return FindIntraL0Compaction(level_files, kMinFilesForIntraL0Compaction, + std::numeric_limits<uint64_t>::max(), + mutable_cf_options_.max_compaction_bytes, + &start_level_inputs_, earliest_mem_seqno_); +} +} // namespace + +Compaction* LevelCompactionPicker::PickCompaction( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + LogBuffer* log_buffer, SequenceNumber earliest_mem_seqno) { + LevelCompactionBuilder builder(cf_name, vstorage, earliest_mem_seqno, this, + log_buffer, mutable_cf_options, ioptions_, + mutable_db_options); + return builder.PickCompaction(); +} +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/compaction/compaction_picker_level.h b/src/rocksdb/db/compaction/compaction_picker_level.h new file mode 100644 index 000000000..42a9b60a6 --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_picker_level.h @@ -0,0 +1,33 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include "db/compaction/compaction_picker.h" + +namespace ROCKSDB_NAMESPACE { +// Picking compactions for leveled compaction. See wiki page +// https://github.com/facebook/rocksdb/wiki/Leveled-Compaction +// for description of Leveled compaction. 
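+//
+// A minimal usage sketch (hypothetical caller; assumes the options objects,
+// the internal key comparator, a VersionStorageInfo and a LogBuffer are
+// already set up):
+//
+//   LevelCompactionPicker picker(ioptions, &icmp);
+//   if (picker.NeedsCompaction(vstorage)) {
+//     Compaction* c = picker.PickCompaction(cf_name, mutable_cf_options,
+//                                           mutable_db_options, vstorage,
+//                                           &log_buffer);
+//     // PickCompaction() may still return nullptr if no compaction can be
+//     // formed at the moment.
+//   }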
+class LevelCompactionPicker : public CompactionPicker { + public: + LevelCompactionPicker(const ImmutableOptions& ioptions, + const InternalKeyComparator* icmp) + : CompactionPicker(ioptions, icmp) {} + virtual Compaction* PickCompaction( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + LogBuffer* log_buffer, + SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override; + + virtual bool NeedsCompaction( + const VersionStorageInfo* vstorage) const override; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/compaction/compaction_picker_test.cc b/src/rocksdb/db/compaction/compaction_picker_test.cc new file mode 100644 index 000000000..2e2e566c0 --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_picker_test.cc @@ -0,0 +1,3964 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include <limits> +#include <string> +#include <utility> + +#include "db/compaction/compaction.h" +#include "db/compaction/compaction_picker_fifo.h" +#include "db/compaction/compaction_picker_level.h" +#include "db/compaction/compaction_picker_universal.h" +#include "db/compaction/file_pri.h" +#include "rocksdb/advanced_options.h" +#include "table/unique_id_impl.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +class CountingLogger : public Logger { + public: + using Logger::Logv; + void Logv(const char* /*format*/, va_list /*ap*/) override { log_count++; } + size_t log_count; +}; + +class CompactionPickerTestBase : public testing::Test { + public: + const Comparator* ucmp_; + InternalKeyComparator icmp_; + Options options_; + ImmutableOptions ioptions_; + MutableCFOptions mutable_cf_options_; + MutableDBOptions mutable_db_options_; + LevelCompactionPicker level_compaction_picker; + std::string cf_name_; + CountingLogger logger_; + LogBuffer log_buffer_; + uint32_t file_num_; + CompactionOptionsFIFO fifo_options_; + std::unique_ptr<VersionStorageInfo> vstorage_; + std::vector<std::unique_ptr<FileMetaData>> files_; + // does not own FileMetaData + std::unordered_map<uint32_t, std::pair<FileMetaData*, int>> file_map_; + // input files to compaction process. + std::vector<CompactionInputFiles> input_files_; + int compaction_level_start_; + + explicit CompactionPickerTestBase(const Comparator* _ucmp) + : ucmp_(_ucmp), + icmp_(ucmp_), + options_(CreateOptions(ucmp_)), + ioptions_(options_), + mutable_cf_options_(options_), + mutable_db_options_(), + level_compaction_picker(ioptions_, &icmp_), + cf_name_("dummy"), + log_buffer_(InfoLogLevel::INFO_LEVEL, &logger_), + file_num_(1), + vstorage_(nullptr) { + mutable_cf_options_.ttl = 0; + mutable_cf_options_.periodic_compaction_seconds = 0; + // ioptions_.compaction_pri = kMinOverlappingRatio has its own set of + // tests to cover. 
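+    // The tests below therefore run with kByCompensatedSize by default and
+    // set compaction_pri explicitly when a different priority is exercised.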
+ ioptions_.compaction_pri = kByCompensatedSize; + fifo_options_.max_table_files_size = 1; + mutable_cf_options_.RefreshDerivedOptions(ioptions_); + ioptions_.cf_paths.emplace_back("dummy", + std::numeric_limits<uint64_t>::max()); + } + + ~CompactionPickerTestBase() override {} + + void NewVersionStorage(int num_levels, CompactionStyle style) { + DeleteVersionStorage(); + options_.num_levels = num_levels; + vstorage_.reset(new VersionStorageInfo(&icmp_, ucmp_, options_.num_levels, + style, nullptr, false)); + vstorage_->PrepareForVersionAppend(ioptions_, mutable_cf_options_); + } + + // Create a new VersionStorageInfo object so we can add mode files and then + // merge it with the existing VersionStorageInfo + void AddVersionStorage() { + temp_vstorage_.reset(new VersionStorageInfo( + &icmp_, ucmp_, options_.num_levels, ioptions_.compaction_style, + vstorage_.get(), false)); + } + + void DeleteVersionStorage() { + vstorage_.reset(); + temp_vstorage_.reset(); + files_.clear(); + file_map_.clear(); + input_files_.clear(); + } + + // REQUIRES: smallest and largest are c-style strings ending with '\0' + void Add(int level, uint32_t file_number, const char* smallest, + const char* largest, uint64_t file_size = 1, uint32_t path_id = 0, + SequenceNumber smallest_seq = 100, SequenceNumber largest_seq = 100, + size_t compensated_file_size = 0, bool marked_for_compact = false, + Temperature temperature = Temperature::kUnknown, + uint64_t oldest_ancestor_time = kUnknownOldestAncesterTime, + Slice ts_of_smallest = Slice(), Slice ts_of_largest = Slice()) { + assert(ts_of_smallest.size() == ucmp_->timestamp_size()); + assert(ts_of_largest.size() == ucmp_->timestamp_size()); + + VersionStorageInfo* vstorage; + if (temp_vstorage_) { + vstorage = temp_vstorage_.get(); + } else { + vstorage = vstorage_.get(); + } + assert(level < vstorage->num_levels()); + char* smallest_key_buf = nullptr; + char* largest_key_buf = nullptr; + + if (!ts_of_smallest.empty()) { + smallest_key_buf = new char[strlen(smallest) + ucmp_->timestamp_size()]; + memcpy(smallest_key_buf, smallest, strlen(smallest)); + memcpy(smallest_key_buf + strlen(smallest), ts_of_smallest.data(), + ucmp_->timestamp_size()); + largest_key_buf = new char[strlen(largest) + ucmp_->timestamp_size()]; + memcpy(largest_key_buf, largest, strlen(largest)); + memcpy(largest_key_buf + strlen(largest), ts_of_largest.data(), + ucmp_->timestamp_size()); + } + + InternalKey smallest_ikey = InternalKey( + smallest_key_buf ? Slice(smallest_key_buf, + ucmp_->timestamp_size() + strlen(smallest)) + : smallest, + smallest_seq, kTypeValue); + InternalKey largest_ikey = InternalKey( + largest_key_buf + ? Slice(largest_key_buf, ucmp_->timestamp_size() + strlen(largest)) + : largest, + largest_seq, kTypeValue); + + FileMetaData* f = new FileMetaData( + file_number, path_id, file_size, smallest_ikey, largest_ikey, + smallest_seq, largest_seq, marked_for_compact, temperature, + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2); + f->compensated_file_size = + (compensated_file_size != 0) ? 
compensated_file_size : file_size; + f->oldest_ancester_time = oldest_ancestor_time; + vstorage->AddFile(level, f); + files_.emplace_back(f); + file_map_.insert({file_number, {f, level}}); + + delete[] smallest_key_buf; + delete[] largest_key_buf; + } + + void SetCompactionInputFilesLevels(int level_count, int start_level) { + input_files_.resize(level_count); + for (int i = 0; i < level_count; ++i) { + input_files_[i].level = start_level + i; + } + compaction_level_start_ = start_level; + } + + void AddToCompactionFiles(uint32_t file_number) { + auto iter = file_map_.find(file_number); + assert(iter != file_map_.end()); + int level = iter->second.second; + assert(level < vstorage_->num_levels()); + input_files_[level - compaction_level_start_].files.emplace_back( + iter->second.first); + } + + void UpdateVersionStorageInfo() { + if (temp_vstorage_) { + VersionBuilder builder(FileOptions(), &ioptions_, nullptr, + vstorage_.get(), nullptr); + ASSERT_OK(builder.SaveTo(temp_vstorage_.get())); + vstorage_ = std::move(temp_vstorage_); + } + vstorage_->PrepareForVersionAppend(ioptions_, mutable_cf_options_); + vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_); + vstorage_->SetFinalized(); + } + + private: + Options CreateOptions(const Comparator* ucmp) const { + Options opts; + opts.comparator = ucmp; + return opts; + } + + std::unique_ptr<VersionStorageInfo> temp_vstorage_; +}; + +class CompactionPickerTest : public CompactionPickerTestBase { + public: + explicit CompactionPickerTest() + : CompactionPickerTestBase(BytewiseComparator()) {} + + ~CompactionPickerTest() override {} +}; + +class CompactionPickerU64TsTest : public CompactionPickerTestBase { + public: + explicit CompactionPickerU64TsTest() + : CompactionPickerTestBase(test::BytewiseComparatorWithU64TsWrapper()) {} + + ~CompactionPickerU64TsTest() override {} +}; + +TEST_F(CompactionPickerTest, Empty) { + NewVersionStorage(6, kCompactionStyleLevel); + UpdateVersionStorageInfo(); + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() == nullptr); +} + +TEST_F(CompactionPickerTest, Single) { + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + Add(0, 1U, "p", "q"); + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() == nullptr); +} + +TEST_F(CompactionPickerTest, Level0Trigger) { + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + Add(0, 1U, "150", "200"); + Add(0, 2U, "200", "250"); + + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(2U, compaction->num_input_files(0)); + ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, Level1Trigger) { + NewVersionStorage(6, kCompactionStyleLevel); + Add(1, 66U, "150", "200", 1000000000U); + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, 
mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(66U, compaction->input(0, 0)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, Level1Trigger2) { + mutable_cf_options_.target_file_size_base = 10000000000; + mutable_cf_options_.RefreshDerivedOptions(ioptions_); + NewVersionStorage(6, kCompactionStyleLevel); + Add(1, 66U, "150", "200", 1000000001U); + Add(1, 88U, "201", "300", 1000000000U); + Add(2, 6U, "150", "179", 1000000000U); + Add(2, 7U, "180", "220", 1000000000U); + Add(2, 8U, "221", "300", 1000000000U); + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(2U, compaction->num_input_files(1)); + ASSERT_EQ(66U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(6U, compaction->input(1, 0)->fd.GetNumber()); + ASSERT_EQ(7U, compaction->input(1, 1)->fd.GetNumber()); + ASSERT_EQ(uint64_t{1073741824}, compaction->OutputFilePreallocationSize()); +} + +TEST_F(CompactionPickerTest, LevelMaxScore) { + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.target_file_size_base = 10000000; + mutable_cf_options_.max_bytes_for_level_base = 10 * 1024 * 1024; + mutable_cf_options_.RefreshDerivedOptions(ioptions_); + Add(0, 1U, "150", "200", 1000000U); + // Level 1 score 1.2 + Add(1, 66U, "150", "200", 6000000U); + Add(1, 88U, "201", "300", 6000000U); + // Level 2 score 1.8. File 7 is the largest. Should be picked + Add(2, 6U, "150", "179", 60000000U); + Add(2, 7U, "180", "220", 60000001U); + Add(2, 8U, "221", "300", 60000000U); + // Level 3 score slightly larger than 1 + Add(3, 26U, "150", "170", 260000000U); + Add(3, 27U, "171", "179", 260000000U); + Add(3, 28U, "191", "220", 260000000U); + Add(3, 29U, "221", "300", 260000000U); + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(mutable_cf_options_.target_file_size_base + + mutable_cf_options_.target_file_size_base / 10, + compaction->OutputFilePreallocationSize()); +} + +TEST_F(CompactionPickerTest, NeedsCompactionLevel) { + const int kLevels = 6; + const int kFileCount = 20; + + for (int level = 0; level < kLevels - 1; ++level) { + NewVersionStorage(kLevels, kCompactionStyleLevel); + uint64_t file_size = vstorage_->MaxBytesForLevel(level) * 2 / kFileCount; + for (int file_count = 1; file_count <= kFileCount; ++file_count) { + // start a brand new version in each test. 
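+      // Each file is 2/kFileCount of the level's byte limit, so the level's
+      // compaction score should cross 1.0 once roughly half of kFileCount
+      // files have been added; NeedsCompaction() is expected to flip to true
+      // at the same point.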
+ NewVersionStorage(kLevels, kCompactionStyleLevel); + for (int i = 0; i < file_count; ++i) { + Add(level, i, std::to_string((i + 100) * 1000).c_str(), + std::to_string((i + 100) * 1000 + 999).c_str(), file_size, 0, + i * 100, i * 100 + 99); + } + UpdateVersionStorageInfo(); + ASSERT_EQ(vstorage_->CompactionScoreLevel(0), level); + ASSERT_EQ(level_compaction_picker.NeedsCompaction(vstorage_.get()), + vstorage_->CompactionScore(0) >= 1); + // release the version storage + DeleteVersionStorage(); + } + } +} + +TEST_F(CompactionPickerTest, Level0TriggerDynamic) { + int num_levels = ioptions_.num_levels; + ioptions_.level_compaction_dynamic_level_bytes = true; + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + mutable_cf_options_.max_bytes_for_level_base = 200; + mutable_cf_options_.max_bytes_for_level_multiplier = 10; + NewVersionStorage(num_levels, kCompactionStyleLevel); + Add(0, 1U, "150", "200"); + Add(0, 2U, "200", "250"); + + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(2U, compaction->num_input_files(0)); + ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber()); + ASSERT_EQ(1, static_cast<int>(compaction->num_input_levels())); + ASSERT_EQ(num_levels - 1, compaction->output_level()); +} + +TEST_F(CompactionPickerTest, Level0TriggerDynamic2) { + int num_levels = ioptions_.num_levels; + ioptions_.level_compaction_dynamic_level_bytes = true; + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + mutable_cf_options_.max_bytes_for_level_base = 200; + mutable_cf_options_.max_bytes_for_level_multiplier = 10; + NewVersionStorage(num_levels, kCompactionStyleLevel); + Add(0, 1U, "150", "200"); + Add(0, 2U, "200", "250"); + Add(num_levels - 1, 3U, "200", "250", 300U); + + UpdateVersionStorageInfo(); + ASSERT_EQ(vstorage_->base_level(), num_levels - 2); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(2U, compaction->num_input_files(0)); + ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber()); + ASSERT_EQ(1, static_cast<int>(compaction->num_input_levels())); + ASSERT_EQ(num_levels - 2, compaction->output_level()); +} + +TEST_F(CompactionPickerTest, Level0TriggerDynamic3) { + int num_levels = ioptions_.num_levels; + ioptions_.level_compaction_dynamic_level_bytes = true; + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + mutable_cf_options_.max_bytes_for_level_base = 200; + mutable_cf_options_.max_bytes_for_level_multiplier = 10; + NewVersionStorage(num_levels, kCompactionStyleLevel); + Add(0, 1U, "150", "200"); + Add(0, 2U, "200", "250"); + Add(num_levels - 1, 3U, "200", "250", 300U); + Add(num_levels - 1, 4U, "300", "350", 3000U); + + UpdateVersionStorageInfo(); + ASSERT_EQ(vstorage_->base_level(), num_levels - 3); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(2U, compaction->num_input_files(0)); + ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(2U, 
compaction->input(0, 1)->fd.GetNumber()); + ASSERT_EQ(1, static_cast<int>(compaction->num_input_levels())); + ASSERT_EQ(num_levels - 3, compaction->output_level()); +} + +TEST_F(CompactionPickerTest, Level0TriggerDynamic4) { + int num_levels = ioptions_.num_levels; + ioptions_.level_compaction_dynamic_level_bytes = true; + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + mutable_cf_options_.max_bytes_for_level_base = 200; + mutable_cf_options_.max_bytes_for_level_multiplier = 10; + + NewVersionStorage(num_levels, kCompactionStyleLevel); + Add(0, 1U, "150", "200"); + Add(0, 2U, "200", "250"); + Add(num_levels - 1, 3U, "200", "250", 300U); + Add(num_levels - 1, 4U, "300", "350", 3000U); + Add(num_levels - 3, 5U, "150", "180", 3U); + Add(num_levels - 3, 6U, "181", "300", 3U); + Add(num_levels - 3, 7U, "400", "450", 3U); + + UpdateVersionStorageInfo(); + ASSERT_EQ(vstorage_->base_level(), num_levels - 3); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(2U, compaction->num_input_files(0)); + ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber()); + ASSERT_EQ(2U, compaction->num_input_files(1)); + ASSERT_EQ(num_levels - 3, compaction->level(1)); + ASSERT_EQ(5U, compaction->input(1, 0)->fd.GetNumber()); + ASSERT_EQ(6U, compaction->input(1, 1)->fd.GetNumber()); + ASSERT_EQ(2, static_cast<int>(compaction->num_input_levels())); + ASSERT_EQ(num_levels - 3, compaction->output_level()); +} + +TEST_F(CompactionPickerTest, LevelTriggerDynamic4) { + int num_levels = ioptions_.num_levels; + ioptions_.level_compaction_dynamic_level_bytes = true; + ioptions_.compaction_pri = kMinOverlappingRatio; + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + mutable_cf_options_.max_bytes_for_level_base = 200; + mutable_cf_options_.max_bytes_for_level_multiplier = 10; + NewVersionStorage(num_levels, kCompactionStyleLevel); + Add(0, 1U, "150", "200"); + Add(num_levels - 1, 2U, "200", "250", 300U); + Add(num_levels - 1, 3U, "300", "350", 3000U); + Add(num_levels - 1, 4U, "400", "450", 3U); + Add(num_levels - 2, 5U, "150", "180", 300U); + Add(num_levels - 2, 6U, "181", "350", 500U); + Add(num_levels - 2, 7U, "400", "450", 200U); + + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(5U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(0, compaction->num_input_files(1)); + ASSERT_EQ(1U, compaction->num_input_levels()); + ASSERT_EQ(num_levels - 1, compaction->output_level()); +} + +// Universal and FIFO Compactions are not supported in ROCKSDB_LITE +#ifndef ROCKSDB_LITE +TEST_F(CompactionPickerTest, NeedsCompactionUniversal) { + NewVersionStorage(1, kCompactionStyleUniversal); + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + UpdateVersionStorageInfo(); + // must return false when there's no files. + ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()), + false); + + // verify the trigger given different number of L0 files. 
+ for (int i = 1; + i <= mutable_cf_options_.level0_file_num_compaction_trigger * 2; ++i) { + NewVersionStorage(1, kCompactionStyleUniversal); + Add(0, i, std::to_string((i + 100) * 1000).c_str(), + std::to_string((i + 100) * 1000 + 999).c_str(), 1000000, 0, i * 100, + i * 100 + 99); + UpdateVersionStorageInfo(); + ASSERT_EQ(level_compaction_picker.NeedsCompaction(vstorage_.get()), + vstorage_->CompactionScore(0) >= 1); + } +} + +TEST_F(CompactionPickerTest, CompactionUniversalIngestBehindReservedLevel) { + const uint64_t kFileSize = 100000; + NewVersionStorage(1, kCompactionStyleUniversal); + ioptions_.allow_ingest_behind = true; + ioptions_.num_levels = 3; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + UpdateVersionStorageInfo(); + // must return false when there's no files. + ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()), + false); + + NewVersionStorage(3, kCompactionStyleUniversal); + + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); + Add(0, 2U, "201", "250", kFileSize, 0, 401, 450); + Add(0, 4U, "260", "300", kFileSize, 0, 260, 300); + Add(1, 5U, "100", "151", kFileSize, 0, 200, 251); + Add(1, 3U, "301", "350", kFileSize, 0, 101, 150); + Add(2, 6U, "120", "200", kFileSize, 0, 20, 100); + + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + + // output level should be the one above the bottom-most + ASSERT_EQ(1, compaction->output_level()); +} +// Tests if the files can be trivially moved in multi level +// universal compaction when allow_trivial_move option is set +// In this test as the input files overlaps, they cannot +// be trivially moved. + +TEST_F(CompactionPickerTest, CannotTrivialMoveUniversal) { + const uint64_t kFileSize = 100000; + + mutable_cf_options_.compaction_options_universal.allow_trivial_move = true; + NewVersionStorage(1, kCompactionStyleUniversal); + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + UpdateVersionStorageInfo(); + // must return false when there's no files. + ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()), + false); + + NewVersionStorage(3, kCompactionStyleUniversal); + + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); + Add(0, 2U, "201", "250", kFileSize, 0, 401, 450); + Add(0, 4U, "260", "300", kFileSize, 0, 260, 300); + Add(1, 5U, "100", "151", kFileSize, 0, 200, 251); + Add(1, 3U, "301", "350", kFileSize, 0, 101, 150); + Add(2, 6U, "120", "200", kFileSize, 0, 20, 100); + + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + + ASSERT_TRUE(!compaction->is_trivial_move()); +} +// Tests if the files can be trivially moved in multi level +// universal compaction when allow_trivial_move option is set +// In this test as the input files doesn't overlaps, they should +// be trivially moved. 
+TEST_F(CompactionPickerTest, AllowsTrivialMoveUniversal) { + const uint64_t kFileSize = 100000; + + mutable_cf_options_.compaction_options_universal.allow_trivial_move = true; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(3, kCompactionStyleUniversal); + + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); + Add(0, 2U, "201", "250", kFileSize, 0, 401, 450); + Add(0, 4U, "260", "300", kFileSize, 0, 260, 300); + Add(1, 5U, "010", "080", kFileSize, 0, 200, 251); + Add(2, 3U, "301", "350", kFileSize, 0, 101, 150); + + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + + ASSERT_TRUE(compaction->is_trivial_move()); +} + +TEST_F(CompactionPickerTest, UniversalPeriodicCompaction1) { + // The case where universal periodic compaction can be picked + // with some newer files being compacted. + const uint64_t kFileSize = 100000; + + mutable_cf_options_.periodic_compaction_seconds = 1000; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(5, kCompactionStyleUniversal); + + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); + Add(0, 2U, "201", "250", kFileSize, 0, 401, 450); + Add(0, 4U, "260", "300", kFileSize, 0, 260, 300); + Add(3, 5U, "010", "080", kFileSize, 0, 200, 251); + Add(4, 3U, "301", "350", kFileSize, 0, 101, 150); + Add(4, 6U, "501", "750", kFileSize, 0, 101, 150); + + file_map_[2].first->being_compacted = true; + UpdateVersionStorageInfo(); + vstorage_->TEST_AddFileMarkedForPeriodicCompaction(4, file_map_[3].first); + + std::unique_ptr<Compaction> compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + + ASSERT_TRUE(compaction); + ASSERT_EQ(4, compaction->output_level()); + ASSERT_EQ(0, compaction->start_level()); + ASSERT_EQ(1U, compaction->num_input_files(0)); +} + +TEST_F(CompactionPickerTest, UniversalPeriodicCompaction2) { + // The case where universal periodic compaction does not + // pick up only level to compact if it doesn't cover + // any file marked as periodic compaction. + const uint64_t kFileSize = 100000; + + mutable_cf_options_.periodic_compaction_seconds = 1000; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(5, kCompactionStyleUniversal); + + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); + Add(3, 5U, "010", "080", kFileSize, 0, 200, 251); + Add(4, 3U, "301", "350", kFileSize, 0, 101, 150); + Add(4, 6U, "501", "750", kFileSize, 0, 101, 150); + + file_map_[5].first->being_compacted = true; + UpdateVersionStorageInfo(); + vstorage_->TEST_AddFileMarkedForPeriodicCompaction(0, file_map_[1].first); + + std::unique_ptr<Compaction> compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + + ASSERT_FALSE(compaction); +} + +TEST_F(CompactionPickerTest, UniversalPeriodicCompaction3) { + // The case where universal periodic compaction does not + // pick up only the last sorted run which is an L0 file if it isn't + // marked as periodic compaction. 
+ const uint64_t kFileSize = 100000; + + mutable_cf_options_.periodic_compaction_seconds = 1000; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(5, kCompactionStyleUniversal); + + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); + Add(0, 5U, "010", "080", kFileSize, 0, 200, 251); + Add(0, 6U, "501", "750", kFileSize, 0, 101, 150); + + file_map_[5].first->being_compacted = true; + UpdateVersionStorageInfo(); + vstorage_->TEST_AddFileMarkedForPeriodicCompaction(0, file_map_[1].first); + + std::unique_ptr<Compaction> compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + + ASSERT_FALSE(compaction); +} + +TEST_F(CompactionPickerTest, UniversalPeriodicCompaction4) { + // The case where universal periodic compaction couldn't form + // a compaction that includes any file marked for periodic compaction. + // Right now we form the compaction anyway if it is more than one + // sorted run. Just put the case here to validate that it doesn't + // crash. + const uint64_t kFileSize = 100000; + + mutable_cf_options_.periodic_compaction_seconds = 1000; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(5, kCompactionStyleUniversal); + + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); + Add(2, 2U, "010", "080", kFileSize, 0, 200, 251); + Add(3, 5U, "010", "080", kFileSize, 0, 200, 251); + Add(4, 3U, "301", "350", kFileSize, 0, 101, 150); + Add(4, 6U, "501", "750", kFileSize, 0, 101, 150); + + file_map_[2].first->being_compacted = true; + UpdateVersionStorageInfo(); + vstorage_->TEST_AddFileMarkedForPeriodicCompaction(0, file_map_[2].first); + + std::unique_ptr<Compaction> compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(!compaction || + compaction->start_level() != compaction->output_level()); +} + +TEST_F(CompactionPickerTest, UniversalPeriodicCompaction5) { + // Test single L0 file periodic compaction triggering. 
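+  // Even a single marked L0 file is expected to be picked on its own and
+  // compacted all the way down to the bottommost level (level 4 here).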
+ const uint64_t kFileSize = 100000; + + mutable_cf_options_.periodic_compaction_seconds = 1000; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(5, kCompactionStyleUniversal); + + Add(0, 6U, "150", "200", kFileSize, 0, 500, 550); + UpdateVersionStorageInfo(); + vstorage_->TEST_AddFileMarkedForPeriodicCompaction(0, file_map_[6].first); + + std::unique_ptr<Compaction> compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction); + ASSERT_EQ(0, compaction->start_level()); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(6U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(4, compaction->output_level()); +} + +TEST_F(CompactionPickerTest, UniversalPeriodicCompaction6) { + // Test single sorted run non-L0 periodic compaction + const uint64_t kFileSize = 100000; + + mutable_cf_options_.periodic_compaction_seconds = 1000; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(5, kCompactionStyleUniversal); + + Add(4, 5U, "150", "200", kFileSize, 0, 500, 550); + Add(4, 6U, "350", "400", kFileSize, 0, 500, 550); + UpdateVersionStorageInfo(); + vstorage_->TEST_AddFileMarkedForPeriodicCompaction(4, file_map_[6].first); + + std::unique_ptr<Compaction> compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction); + ASSERT_EQ(4, compaction->start_level()); + ASSERT_EQ(2U, compaction->num_input_files(0)); + ASSERT_EQ(5U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(6U, compaction->input(0, 1)->fd.GetNumber()); + ASSERT_EQ(4, compaction->output_level()); +} + +TEST_F(CompactionPickerTest, UniversalIncrementalSpace1) { + const uint64_t kFileSize = 100000; + + mutable_cf_options_.max_compaction_bytes = 555555; + mutable_cf_options_.compaction_options_universal.incremental = true; + mutable_cf_options_.compaction_options_universal + .max_size_amplification_percent = 30; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(5, kCompactionStyleUniversal); + + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); + Add(2, 2U, "010", "080", kFileSize, 0, 200, 251); + Add(3, 5U, "310", "380", kFileSize, 0, 200, 251); + Add(3, 6U, "410", "880", kFileSize, 0, 200, 251); + Add(3, 7U, "910", "980", 1, 0, 200, 251); + Add(4, 10U, "201", "250", kFileSize, 0, 101, 150); + Add(4, 11U, "301", "350", kFileSize, 0, 101, 150); + Add(4, 12U, "401", "450", kFileSize, 0, 101, 150); + Add(4, 13U, "501", "750", kFileSize, 0, 101, 150); + Add(4, 14U, "801", "850", kFileSize, 0, 101, 150); + Add(4, 15U, "901", "950", kFileSize, 0, 101, 150); + // Add(4, 15U, "960", "970", kFileSize, 0, 101, 150); + + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction); + ASSERT_EQ(4, compaction->output_level()); + ASSERT_EQ(3, compaction->start_level()); + ASSERT_EQ(2U, compaction->num_input_files(0)); + ASSERT_EQ(5U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(6U, compaction->input(0, 1)->fd.GetNumber()); + // ASSERT_EQ(4U, compaction->num_input_files(1)); + ASSERT_EQ(11U, compaction->input(1, 0)->fd.GetNumber()); + ASSERT_EQ(12U, compaction->input(1, 1)->fd.GetNumber()); + 
ASSERT_EQ(13U, compaction->input(1, 2)->fd.GetNumber()); + ASSERT_EQ(14U, compaction->input(1, 3)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, UniversalIncrementalSpace2) { + const uint64_t kFileSize = 100000; + + mutable_cf_options_.max_compaction_bytes = 400000; + mutable_cf_options_.compaction_options_universal.incremental = true; + mutable_cf_options_.compaction_options_universal + .max_size_amplification_percent = 30; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(5, kCompactionStyleUniversal); + + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); + Add(1, 2U, "010", "080", kFileSize, 0, 200, 251); + Add(2, 5U, "310", "380", kFileSize, 0, 200, 251); + Add(2, 6U, "410", "880", kFileSize, 0, 200, 251); + Add(2, 7U, "910", "980", kFileSize, 0, 200, 251); + Add(4, 10U, "201", "250", kFileSize, 0, 101, 150); + Add(4, 11U, "301", "350", kFileSize, 0, 101, 150); + Add(4, 12U, "401", "450", kFileSize, 0, 101, 150); + Add(4, 13U, "501", "750", kFileSize, 0, 101, 150); + Add(4, 14U, "801", "850", kFileSize, 0, 101, 150); + Add(4, 15U, "901", "950", kFileSize, 0, 101, 150); + + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction); + ASSERT_EQ(4, compaction->output_level()); + ASSERT_EQ(2, compaction->start_level()); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(1U, compaction->num_input_files(1)); + ASSERT_EQ(15U, compaction->input(1, 0)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, UniversalIncrementalSpace3) { + // Test bottom level files falling between gaps between two upper level + // files + const uint64_t kFileSize = 100000; + + mutable_cf_options_.max_compaction_bytes = 300000; + mutable_cf_options_.compaction_options_universal.incremental = true; + mutable_cf_options_.compaction_options_universal + .max_size_amplification_percent = 30; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(5, kCompactionStyleUniversal); + + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); + Add(2, 2U, "010", "080", kFileSize, 0, 200, 251); + Add(3, 5U, "000", "180", kFileSize, 0, 200, 251); + Add(3, 6U, "181", "190", kFileSize, 0, 200, 251); + Add(3, 7U, "710", "810", kFileSize, 0, 200, 251); + Add(3, 8U, "820", "830", kFileSize, 0, 200, 251); + Add(3, 9U, "900", "991", kFileSize, 0, 200, 251); + Add(4, 10U, "201", "250", kFileSize, 0, 101, 150); + Add(4, 11U, "301", "350", kFileSize, 0, 101, 150); + Add(4, 12U, "401", "450", kFileSize, 0, 101, 150); + Add(4, 13U, "501", "750", kFileSize, 0, 101, 150); + Add(4, 14U, "801", "850", kFileSize, 0, 101, 150); + Add(4, 15U, "901", "950", kFileSize, 0, 101, 150); + + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction); + ASSERT_EQ(4, compaction->output_level()); + ASSERT_EQ(2, compaction->start_level()); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(2U, compaction->num_input_files(1)); + ASSERT_EQ(5U, compaction->input(1, 0)->fd.GetNumber()); + ASSERT_EQ(6U, compaction->input(1, 1)->fd.GetNumber()); + ASSERT_EQ(0, compaction->num_input_files(2)); +} + 
+TEST_F(CompactionPickerTest, UniversalIncrementalSpace4) { + // Test compaction candidates always cover many files. + const uint64_t kFileSize = 100000; + + mutable_cf_options_.max_compaction_bytes = 3200000; + mutable_cf_options_.compaction_options_universal.incremental = true; + mutable_cf_options_.compaction_options_universal + .max_size_amplification_percent = 30; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(5, kCompactionStyleUniversal); + + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); + Add(2, 2U, "010", "080", kFileSize, 0, 200, 251); + + // Generate files like following: + // L3: (1101, 1180) (1201, 1280) ... (7901, 7908) + // L4: (1130, 1150) (1160, 1210) (1230, 1250) (1260 1310) ... (7960, 8010) + for (int i = 11; i < 79; i++) { + Add(3, 100 + i * 3, std::to_string(i * 100).c_str(), + std::to_string(i * 100 + 80).c_str(), kFileSize, 0, 200, 251); + // Add a tie breaker + if (i == 66) { + Add(3, 10000U, "6690", "6699", kFileSize, 0, 200, 251); + } + + Add(4, 100 + i * 3 + 1, std::to_string(i * 100 + 30).c_str(), + std::to_string(i * 100 + 50).c_str(), kFileSize, 0, 200, 251); + Add(4, 100 + i * 3 + 2, std::to_string(i * 100 + 60).c_str(), + std::to_string(i * 100 + 110).c_str(), kFileSize, 0, 200, 251); + } + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction); + ASSERT_EQ(4, compaction->output_level()); + ASSERT_EQ(3, compaction->start_level()); + ASSERT_EQ(6U, compaction->num_input_files(0)); + ASSERT_EQ(100 + 62U * 3, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(10000U, compaction->input(0, 5)->fd.GetNumber()); + ASSERT_EQ(11, compaction->num_input_files(1)); +} + +TEST_F(CompactionPickerTest, UniversalIncrementalSpace5) { + // Test compaction candidates always cover many files with some single + // files larger than size threshold. + const uint64_t kFileSize = 100000; + + mutable_cf_options_.max_compaction_bytes = 3200000; + mutable_cf_options_.compaction_options_universal.incremental = true; + mutable_cf_options_.compaction_options_universal + .max_size_amplification_percent = 30; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(5, kCompactionStyleUniversal); + + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); + Add(2, 2U, "010", "080", kFileSize, 0, 200, 251); + + // Generate files like following: + // L3: (1101, 1180) (1201, 1280) ... (7901, 7908) + // L4: (1130, 1150) (1160, 1210) (1230, 1250) (1260 1310) ... (7960, 8010) + for (int i = 11; i < 70; i++) { + Add(3, 100 + i * 3, std::to_string(i * 100).c_str(), + std::to_string(i * 100 + 80).c_str(), + i % 10 == 9 ? 
kFileSize * 100 : kFileSize, 0, 200, 251);
+
+ Add(4, 100 + i * 3 + 1, std::to_string(i * 100 + 30).c_str(),
+ std::to_string(i * 100 + 50).c_str(), kFileSize, 0, 200, 251);
+ Add(4, 100 + i * 3 + 2, std::to_string(i * 100 + 60).c_str(),
+ std::to_string(i * 100 + 110).c_str(), kFileSize, 0, 200, 251);
+ }
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction);
+ ASSERT_EQ(4, compaction->output_level());
+ ASSERT_EQ(3, compaction->start_level());
+ ASSERT_EQ(6U, compaction->num_input_files(0));
+ ASSERT_EQ(100 + 14 * 3, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(100 + 19 * 3, compaction->input(0, 5)->fd.GetNumber());
+ ASSERT_EQ(13, compaction->num_input_files(1));
+}
+
+TEST_F(CompactionPickerTest, NeedsCompactionFIFO) {
+ NewVersionStorage(1, kCompactionStyleFIFO);
+ const int kFileCount =
+ mutable_cf_options_.level0_file_num_compaction_trigger * 3;
+ const uint64_t kFileSize = 100000;
+ const uint64_t kMaxSize = kFileSize * kFileCount / 2;
+
+ fifo_options_.max_table_files_size = kMaxSize;
+ mutable_cf_options_.compaction_options_fifo = fifo_options_;
+ FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+ UpdateVersionStorageInfo();
+ // must return false when there are no files.
+ ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), false);
+
+ // verify whether compaction is needed based on the current
+ // size of L0 files.
+ for (int i = 1; i <= kFileCount; ++i) {
+ NewVersionStorage(1, kCompactionStyleFIFO);
+ Add(0, i, std::to_string((i + 100) * 1000).c_str(),
+ std::to_string((i + 100) * 1000 + 999).c_str(), kFileSize, 0, i * 100,
+ i * 100 + 99);
+ UpdateVersionStorageInfo();
+ ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()),
+ vstorage_->CompactionScore(0) >= 1);
+ }
+}
+
+TEST_F(CompactionPickerTest, FIFOToWarm1) {
+ NewVersionStorage(1, kCompactionStyleFIFO);
+ const uint64_t kFileSize = 100000;
+ const uint64_t kMaxSize = kFileSize * 100000;
+ uint64_t kWarmThreshold = 2000;
+
+ fifo_options_.max_table_files_size = kMaxSize;
+ fifo_options_.age_for_warm = kWarmThreshold;
+ mutable_cf_options_.compaction_options_fifo = fifo_options_;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_compaction_bytes = kFileSize * 100;
+ FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+
+ int64_t current_time = 0;
+ ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
+ uint64_t threshold_time =
+ static_cast<uint64_t>(current_time) - kWarmThreshold;
+ Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
+ Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
+ Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
+ Temperature::kUnknown, threshold_time + 100);
+ Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
+ Temperature::kUnknown, threshold_time - 2000);
+ Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
+ Temperature::kUnknown, threshold_time - 3000);
+ UpdateVersionStorageInfo();
+
+ ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true);
+ std::unique_ptr<Compaction> compaction(fifo_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, FIFOToWarm2) {
+ NewVersionStorage(1, kCompactionStyleFIFO);
+ const uint64_t kFileSize = 100000;
+ const uint64_t kMaxSize = kFileSize * 100000;
+ uint64_t kWarmThreshold = 2000;
+
+ fifo_options_.max_table_files_size = kMaxSize;
+ fifo_options_.age_for_warm = kWarmThreshold;
+ mutable_cf_options_.compaction_options_fifo = fifo_options_;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_compaction_bytes = kFileSize * 100;
+ FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+
+ int64_t current_time = 0;
+ ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
+ uint64_t threshold_time =
+ static_cast<uint64_t>(current_time) - kWarmThreshold;
+ Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
+ Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
+ Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
+ Temperature::kUnknown, threshold_time + 100);
+ Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
+ Temperature::kUnknown, threshold_time - 2000);
+ Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
+ Temperature::kUnknown, threshold_time - 3000);
+ Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true,
+ Temperature::kUnknown, threshold_time - 4000);
+ UpdateVersionStorageInfo();
+
+ ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true);
+ std::unique_ptr<Compaction> compaction(fifo_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(3U, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, FIFOToWarmMaxSize) {
+ NewVersionStorage(1, kCompactionStyleFIFO);
+ const uint64_t kFileSize = 100000;
+ const uint64_t kMaxSize = kFileSize * 100000;
+ uint64_t kWarmThreshold = 2000;
+
+ fifo_options_.max_table_files_size = kMaxSize;
+ fifo_options_.age_for_warm = kWarmThreshold;
+ mutable_cf_options_.compaction_options_fifo = fifo_options_;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_compaction_bytes = kFileSize * 9;
+ FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+
+ int64_t current_time = 0;
+ ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
+ uint64_t threshold_time =
+ static_cast<uint64_t>(current_time) - kWarmThreshold;
+ Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
+ Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
+ Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
+ Temperature::kUnknown, threshold_time + 100);
+ Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
+ Temperature::kUnknown, threshold_time - 2000);
+ Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
+ Temperature::kUnknown, threshold_time - 3000);
+ Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true,
+ Temperature::kUnknown, threshold_time - 4000);
+ Add(0, 1U, "200", "300", 4 * kFileSize, 0, 2000, 2100, 0, true,
+ Temperature::kUnknown, threshold_time - 5000);
+ UpdateVersionStorageInfo();
+
+ ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true);
+ std::unique_ptr<Compaction> compaction(fifo_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, FIFOToWarmWithExistingWarm) {
+ NewVersionStorage(1, kCompactionStyleFIFO);
+ const uint64_t kFileSize = 100000;
+ const uint64_t kMaxSize = kFileSize * 100000;
+ uint64_t kWarmThreshold = 2000;
+
+ fifo_options_.max_table_files_size = kMaxSize;
+ fifo_options_.age_for_warm = kWarmThreshold;
+ mutable_cf_options_.compaction_options_fifo = fifo_options_;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_compaction_bytes = kFileSize * 100;
+ FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+
+ int64_t current_time = 0;
+ ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
+ uint64_t threshold_time =
+ static_cast<uint64_t>(current_time) - kWarmThreshold;
+ Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
+ Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
+ Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
+ Temperature::kUnknown, threshold_time + 100);
+ Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
+ Temperature::kUnknown, threshold_time - 2000);
+ Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
+ Temperature::kUnknown, threshold_time - 3000);
+ Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true,
+ Temperature::kUnknown, threshold_time - 4000);
+ Add(0, 1U, "200", "300", 4 * kFileSize, 0, 2000, 2100, 0, true,
+ Temperature::kWarm, threshold_time - 5000);
+ UpdateVersionStorageInfo();
+
+ ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true);
+ std::unique_ptr<Compaction> compaction(fifo_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(3U, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, FIFOToWarmWithOngoing) {
+ NewVersionStorage(1, kCompactionStyleFIFO);
+ const uint64_t kFileSize = 100000;
+ const uint64_t kMaxSize = kFileSize * 100000;
+ uint64_t kWarmThreshold = 2000;
+
+ fifo_options_.max_table_files_size = kMaxSize;
+ fifo_options_.age_for_warm = kWarmThreshold;
+ mutable_cf_options_.compaction_options_fifo = fifo_options_;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_compaction_bytes = kFileSize * 100;
+ FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+
+ int64_t current_time = 0;
+ ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
+ uint64_t threshold_time =
+ static_cast<uint64_t>(current_time) - kWarmThreshold;
+ Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
+ Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
+ Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
+ Temperature::kUnknown, threshold_time + 100);
+ Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
+ Temperature::kUnknown, threshold_time - 2000);
+ Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
+ Temperature::kUnknown, threshold_time - 3000);
+ Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true,
+ Temperature::kUnknown, threshold_time - 4000);
+ Add(0, 1U, "200", "300", 4 * kFileSize, 0, 2000, 2100, 0, true,
+ Temperature::kWarm, threshold_time - 5000);
+ file_map_[2].first->being_compacted = true;
+ UpdateVersionStorageInfo();
+
+ ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true);
+ std::unique_ptr<Compaction> compaction(fifo_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ // Stop if a file is being compacted
+ ASSERT_TRUE(compaction.get() == nullptr);
+}
+
+TEST_F(CompactionPickerTest, FIFOToWarmWithHotBetweenWarms) {
+ NewVersionStorage(1, kCompactionStyleFIFO);
+ const uint64_t kFileSize = 100000;
+ const uint64_t kMaxSize = kFileSize * 100000;
+ uint64_t kWarmThreshold = 2000;
+
+ fifo_options_.max_table_files_size = kMaxSize;
+ fifo_options_.age_for_warm = kWarmThreshold;
+ mutable_cf_options_.compaction_options_fifo = fifo_options_;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_compaction_bytes = kFileSize * 100;
+ FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+
+ int64_t current_time = 0;
+ ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
+ uint64_t threshold_time =
+ static_cast<uint64_t>(current_time) - kWarmThreshold;
+ Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
+ Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
+ Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
+ Temperature::kUnknown, threshold_time + 100);
+ Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
+ Temperature::kUnknown, threshold_time - 2000);
+ Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
+ Temperature::kWarm, threshold_time - 3000);
+ Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true,
+ Temperature::kUnknown, threshold_time - 4000);
+ Add(0, 1U, "200", "300", 4 * kFileSize, 0, 2000, 2100, 0, true,
+ Temperature::kWarm, threshold_time - 5000);
+ UpdateVersionStorageInfo();
+
+ ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true);
+ std::unique_ptr<Compaction> compaction(fifo_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ // Only file 2, the non-warm file sitting between the two already-warm
+ // files, is picked for the to-warm compaction.
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+}
+
+#endif  // ROCKSDB_LITE
+
+TEST_F(CompactionPickerTest, CompactionPriMinOverlapping1) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ mutable_cf_options_.target_file_size_base = 100000000000;
+ mutable_cf_options_.target_file_size_multiplier = 10;
+ mutable_cf_options_.max_bytes_for_level_base = 10 * 1024 * 1024;
+ mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+
+ Add(2, 6U, "150", "179", 50000000U);
+ Add(2, 7U, "180", "220", 50000000U);
+ Add(2, 8U, "321", "400", 50000000U); // File not overlapping
+ Add(2, 9U, "721", "800", 50000000U);
+
+ Add(3, 26U, "150", "170", 260000000U);
+ Add(3, 27U, "171", "179", 260000000U);
+ Add(3, 28U, "191", "220", 260000000U);
+ Add(3, 29U, "221", "300", 260000000U);
+ Add(3, 30U, "750", "900", 260000000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ // Pick file 8 because it
overlaps with 0 files on level 3. + ASSERT_EQ(8U, compaction->input(0, 0)->fd.GetNumber()); + // Compaction input size * 1.1 + ASSERT_GE(uint64_t{55000000}, compaction->OutputFilePreallocationSize()); +} + +TEST_F(CompactionPickerTest, CompactionPriMinOverlapping2) { + NewVersionStorage(6, kCompactionStyleLevel); + ioptions_.compaction_pri = kMinOverlappingRatio; + mutable_cf_options_.target_file_size_base = 10000000; + mutable_cf_options_.target_file_size_multiplier = 10; + mutable_cf_options_.max_bytes_for_level_base = 10 * 1024 * 1024; + + Add(2, 6U, "150", "175", + 60000000U); // Overlaps with file 26, 27, total size 521M + Add(2, 7U, "176", "200", 60000000U); // Overlaps with file 27, 28, total size + // 520M, the smallest overlapping + Add(2, 8U, "201", "300", + 60000000U); // Overlaps with file 28, 29, total size 521M + + Add(3, 25U, "100", "110", 261000000U); + Add(3, 26U, "150", "170", 261000000U); + Add(3, 27U, "171", "179", 260000000U); + Add(3, 28U, "191", "220", 260000000U); + Add(3, 29U, "221", "300", 261000000U); + Add(3, 30U, "321", "400", 261000000U); + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_files(0)); + // Picking file 7 because overlapping ratio is the biggest. + ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, CompactionPriMinOverlapping3) { + NewVersionStorage(6, kCompactionStyleLevel); + ioptions_.compaction_pri = kMinOverlappingRatio; + mutable_cf_options_.max_bytes_for_level_base = 10000000; + mutable_cf_options_.max_bytes_for_level_multiplier = 10; + + // file 7 and 8 over lap with the same file, but file 8 is smaller so + // it will be picked. + Add(2, 6U, "150", "167", 60000000U); // Overlaps with file 26, 27 + Add(2, 7U, "168", "169", 60000000U); // Overlaps with file 27 + Add(2, 8U, "201", "300", 61000000U); // Overlaps with file 28, but the file + // itself is larger. Should be picked. + + Add(3, 26U, "160", "165", 260000000U); + Add(3, 27U, "166", "170", 260000000U); + Add(3, 28U, "180", "400", 260000000U); + Add(3, 29U, "401", "500", 260000000U); + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_files(0)); + // Picking file 8 because overlapping ratio is the biggest. + ASSERT_EQ(8U, compaction->input(0, 0)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, CompactionPriMinOverlapping4) { + NewVersionStorage(6, kCompactionStyleLevel); + ioptions_.compaction_pri = kMinOverlappingRatio; + mutable_cf_options_.max_bytes_for_level_base = 10000000; + mutable_cf_options_.max_bytes_for_level_multiplier = 10; + mutable_cf_options_.ignore_max_compaction_bytes_for_input = false; + + // file 7 and 8 over lap with the same file, but file 8 is smaller so + // it will be picked. + // Overlaps with file 26, 27. And the file is compensated so will be + // picked up. + Add(2, 6U, "150", "167", 60000000U, 0, 100, 100, 180000000U); + Add(2, 7U, "168", "169", 60000000U); // Overlaps with file 27 + Add(2, 8U, "201", "300", 61000000U); // Overlaps with file 28 + + Add(3, 26U, "160", "165", 60000000U); + // Boosted file size in output level is not considered. 
+ Add(3, 27U, "166", "170", 60000000U, 0, 100, 100, 260000000U); + Add(3, 28U, "180", "400", 60000000U); + Add(3, 29U, "401", "500", 60000000U); + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_files(0)); + // Picking file 8 because overlapping ratio is the biggest. + ASSERT_EQ(6U, compaction->input(0, 0)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, CompactionPriRoundRobin) { + std::vector<InternalKey> test_cursors = {InternalKey("249", 100, kTypeValue), + InternalKey("600", 100, kTypeValue), + InternalKey()}; + std::vector<uint32_t> selected_files = {8U, 6U, 6U}; + + ioptions_.compaction_pri = kRoundRobin; + mutable_cf_options_.max_bytes_for_level_base = 12000000; + mutable_cf_options_.max_bytes_for_level_multiplier = 10; + for (size_t i = 0; i < test_cursors.size(); i++) { + // start a brand new version in each test. + NewVersionStorage(6, kCompactionStyleLevel); + vstorage_->ResizeCompactCursors(6); + // Set the cursor + vstorage_->AddCursorForOneLevel(2, test_cursors[i]); + Add(2, 6U, "150", "199", 50000000U); // Overlap with 26U, 27U + Add(2, 7U, "200", "249", 50000000U); // File not overlapping + Add(2, 8U, "300", "600", 50000000U); // Overlap with 28U, 29U + + Add(3, 26U, "130", "165", 60000000U); + Add(3, 27U, "166", "170", 60000000U); + Add(3, 28U, "270", "340", 60000000U); + Add(3, 29U, "401", "500", 60000000U); + UpdateVersionStorageInfo(); + LevelCompactionPicker local_level_compaction_picker = + LevelCompactionPicker(ioptions_, &icmp_); + std::unique_ptr<Compaction> compaction( + local_level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + // Since the max bytes for level 2 is 120M, picking one file to compact + // makes the post-compaction level size less than 120M, there is exactly one + // file picked for round-robin compaction + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(selected_files[i], compaction->input(0, 0)->fd.GetNumber()); + // release the version storage + DeleteVersionStorage(); + } +} + +TEST_F(CompactionPickerTest, CompactionPriMultipleFilesRoundRobin1) { + ioptions_.compaction_pri = kRoundRobin; + mutable_cf_options_.max_compaction_bytes = 100000000u; + mutable_cf_options_.max_bytes_for_level_base = 120; + mutable_cf_options_.max_bytes_for_level_multiplier = 10; + // start a brand new version in each test. 
+ NewVersionStorage(6, kCompactionStyleLevel); + vstorage_->ResizeCompactCursors(6); + // Set the cursor (file picking should start with 7U) + vstorage_->AddCursorForOneLevel(2, InternalKey("199", 100, kTypeValue)); + Add(2, 6U, "150", "199", 500U); + Add(2, 7U, "200", "249", 500U); + Add(2, 8U, "300", "600", 500U); + Add(2, 9U, "700", "800", 500U); + Add(2, 10U, "850", "950", 500U); + + Add(3, 26U, "130", "165", 600U); + Add(3, 27U, "166", "170", 600U); + Add(3, 28U, "270", "340", 600U); + Add(3, 29U, "401", "500", 600U); + Add(3, 30U, "601", "800", 600U); + Add(3, 31U, "830", "890", 600U); + UpdateVersionStorageInfo(); + LevelCompactionPicker local_level_compaction_picker = + LevelCompactionPicker(ioptions_, &icmp_); + std::unique_ptr<Compaction> compaction( + local_level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + + // The maximum compaction bytes is very large in this case so we can igore its + // constraint in this test case. The maximum bytes for level 2 is 1200 + // bytes, and thus at least 3 files should be picked so that the bytes in + // level 2 is less than the maximum + ASSERT_EQ(3U, compaction->num_input_files(0)); + ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(8U, compaction->input(0, 1)->fd.GetNumber()); + ASSERT_EQ(9U, compaction->input(0, 2)->fd.GetNumber()); + // release the version storage + DeleteVersionStorage(); +} + +TEST_F(CompactionPickerTest, CompactionPriMultipleFilesRoundRobin2) { + ioptions_.compaction_pri = kRoundRobin; + mutable_cf_options_.max_compaction_bytes = 2500u; + mutable_cf_options_.max_bytes_for_level_base = 120; + mutable_cf_options_.max_bytes_for_level_multiplier = 10; + // start a brand new version in each test. + NewVersionStorage(6, kCompactionStyleLevel); + vstorage_->ResizeCompactCursors(6); + // Set the cursor (file picking should start with 6U) + vstorage_->AddCursorForOneLevel(2, InternalKey("1000", 100, kTypeValue)); + Add(2, 6U, "150", "199", 500U); // Overlap with 26U, 27U + Add(2, 7U, "200", "249", 500U); // Overlap with 27U + Add(2, 8U, "300", "600", 500U); // Overlap with 28U, 29U + Add(2, 9U, "700", "800", 500U); + Add(2, 10U, "850", "950", 500U); + + Add(3, 26U, "130", "165", 600U); + Add(3, 27U, "166", "230", 600U); + Add(3, 28U, "270", "340", 600U); + Add(3, 29U, "401", "500", 600U); + Add(3, 30U, "601", "800", 600U); + Add(3, 31U, "830", "890", 600U); + UpdateVersionStorageInfo(); + LevelCompactionPicker local_level_compaction_picker = + LevelCompactionPicker(ioptions_, &icmp_); + std::unique_ptr<Compaction> compaction( + local_level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + + // The maximum compaction bytes is only 2500 bytes now. Even though we are + // required to choose 3 files so that the post-compaction level size is less + // than 1200 bytes. We cannot pick 3 files to compact since the maximum + // compaction size is 2500. After picking files 6U and 7U, the number of + // compaction bytes has reached 2200, and thus no more space to add another + // input file with 50M bytes. 
+ ASSERT_EQ(2U, compaction->num_input_files(0)); + ASSERT_EQ(6U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(7U, compaction->input(0, 1)->fd.GetNumber()); + // release the version storage + DeleteVersionStorage(); +} + +TEST_F(CompactionPickerTest, CompactionPriMultipleFilesRoundRobin3) { + ioptions_.compaction_pri = kRoundRobin; + mutable_cf_options_.max_compaction_bytes = 1000000u; + mutable_cf_options_.max_bytes_for_level_base = 120; + mutable_cf_options_.max_bytes_for_level_multiplier = 10; + // start a brand new version in each test. + NewVersionStorage(6, kCompactionStyleLevel); + vstorage_->ResizeCompactCursors(6); + // Set the cursor (file picking should start with 9U) + vstorage_->AddCursorForOneLevel(2, InternalKey("700", 100, kTypeValue)); + Add(2, 6U, "150", "199", 500U); + Add(2, 7U, "200", "249", 500U); + Add(2, 8U, "300", "600", 500U); + Add(2, 9U, "700", "800", 500U); + Add(2, 10U, "850", "950", 500U); + + Add(3, 26U, "130", "165", 600U); + Add(3, 27U, "166", "170", 600U); + Add(3, 28U, "270", "340", 600U); + Add(3, 29U, "401", "500", 600U); + Add(3, 30U, "601", "800", 600U); + Add(3, 31U, "830", "890", 600U); + UpdateVersionStorageInfo(); + LevelCompactionPicker local_level_compaction_picker = + LevelCompactionPicker(ioptions_, &icmp_); + std::unique_ptr<Compaction> compaction( + local_level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + + // Cannot pick more files since we reach the last file in level 2 + ASSERT_EQ(2U, compaction->num_input_files(0)); + ASSERT_EQ(9U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(10U, compaction->input(0, 1)->fd.GetNumber()); + // release the version storage + DeleteVersionStorage(); +} + +TEST_F(CompactionPickerTest, CompactionPriMinOverlappingManyFiles) { + NewVersionStorage(6, kCompactionStyleLevel); + ioptions_.compaction_pri = kMinOverlappingRatio; + mutable_cf_options_.max_bytes_for_level_base = 15000000; + mutable_cf_options_.max_bytes_for_level_multiplier = 10; + + // file 7 and 8 over lap with the same file, but file 8 is smaller so + // it will be picked. + Add(2, 13U, "010", "011", + 6100U); // Overlaps with a large file. Not picked + Add(2, 14U, "020", "021", + 6100U); // Overlaps with a large file. Not picked + Add(2, 15U, "030", "031", + 6100U); // Overlaps with a large file. Not picked + Add(2, 16U, "040", "041", + 6100U); // Overlaps with a large file. Not picked + Add(2, 17U, "050", "051", + 6100U); // Overlaps with a large file. Not picked + Add(2, 18U, "060", "061", + 6100U); // Overlaps with a large file. Not picked + Add(2, 19U, "070", "071", + 6100U); // Overlaps with a large file. Not picked + Add(2, 20U, "080", "081", + 6100U); // Overlaps with a large file. Not picked + + Add(2, 6U, "150", "167", 60000000U); // Overlaps with file 26, 27 + Add(2, 7U, "168", "169", 60000000U); // Overlaps with file 27 + Add(2, 8U, "201", "300", 61000000U); // Overlaps with file 28, but the file + // itself is larger. Should be picked. + Add(2, 9U, "610", "611", + 6100U); // Overlaps with a large file. Not picked + Add(2, 10U, "620", "621", + 6100U); // Overlaps with a large file. Not picked + Add(2, 11U, "630", "631", + 6100U); // Overlaps with a large file. Not picked + Add(2, 12U, "640", "641", + 6100U); // Overlaps with a large file. 
Not picked + + Add(3, 31U, "001", "100", 260000000U); + Add(3, 26U, "160", "165", 260000000U); + Add(3, 27U, "166", "170", 260000000U); + Add(3, 28U, "180", "400", 260000000U); + Add(3, 29U, "401", "500", 260000000U); + Add(3, 30U, "601", "700", 260000000U); + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_files(0)); + // Picking file 8 because overlapping ratio is the biggest. + ASSERT_EQ(8U, compaction->input(0, 0)->fd.GetNumber()); +} + +// This test exhibits the bug where we don't properly reset parent_index in +// PickCompaction() +TEST_F(CompactionPickerTest, ParentIndexResetBug) { + int num_levels = ioptions_.num_levels; + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + mutable_cf_options_.max_bytes_for_level_base = 200; + NewVersionStorage(num_levels, kCompactionStyleLevel); + Add(0, 1U, "150", "200"); // <- marked for compaction + Add(1, 3U, "400", "500", 600); // <- this one needs compacting + Add(2, 4U, "150", "200"); + Add(2, 5U, "201", "210"); + Add(2, 6U, "300", "310"); + Add(2, 7U, "400", "500"); // <- being compacted + + vstorage_->LevelFiles(2)[3]->being_compacted = true; + vstorage_->LevelFiles(0)[0]->marked_for_compaction = true; + + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); +} + +// This test checks ExpandWhileOverlapping() by having overlapping user keys +// ranges (with different sequence numbers) in the input files. +TEST_F(CompactionPickerTest, OverlappingUserKeys) { + NewVersionStorage(6, kCompactionStyleLevel); + ioptions_.compaction_pri = kByCompensatedSize; + + Add(1, 1U, "100", "150", 1U); + // Overlapping user keys + Add(1, 2U, "200", "400", 1U); + Add(1, 3U, "400", "500", 1000000000U, 0, 0); + Add(2, 4U, "600", "700", 1U); + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_levels()); + ASSERT_EQ(2U, compaction->num_input_files(0)); + ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(3U, compaction->input(0, 1)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, OverlappingUserKeys2) { + NewVersionStorage(6, kCompactionStyleLevel); + // Overlapping user keys on same level and output level + Add(1, 1U, "200", "400", 1000000000U); + Add(1, 2U, "400", "500", 1U, 0, 0); + Add(2, 3U, "000", "100", 1U); + Add(2, 4U, "100", "600", 1U, 0, 0); + Add(2, 5U, "600", "700", 1U, 0, 0); + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(2U, compaction->num_input_levels()); + ASSERT_EQ(2U, compaction->num_input_files(0)); + ASSERT_EQ(3U, compaction->num_input_files(1)); + ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber()); + ASSERT_EQ(3U, compaction->input(1, 0)->fd.GetNumber()); + ASSERT_EQ(4U, compaction->input(1, 1)->fd.GetNumber()); + ASSERT_EQ(5U, 
compaction->input(1, 2)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, OverlappingUserKeys3) { + NewVersionStorage(6, kCompactionStyleLevel); + // Chain of overlapping user key ranges (forces ExpandWhileOverlapping() to + // expand multiple times) + Add(1, 1U, "100", "150", 1U); + Add(1, 2U, "150", "200", 1U, 0, 0); + Add(1, 3U, "200", "250", 1000000000U, 0, 0); + Add(1, 4U, "250", "300", 1U, 0, 0); + Add(1, 5U, "300", "350", 1U, 0, 0); + // Output level overlaps with the beginning and the end of the chain + Add(2, 6U, "050", "100", 1U); + Add(2, 7U, "350", "400", 1U); + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(2U, compaction->num_input_levels()); + ASSERT_EQ(5U, compaction->num_input_files(0)); + ASSERT_EQ(2U, compaction->num_input_files(1)); + ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber()); + ASSERT_EQ(3U, compaction->input(0, 2)->fd.GetNumber()); + ASSERT_EQ(4U, compaction->input(0, 3)->fd.GetNumber()); + ASSERT_EQ(5U, compaction->input(0, 4)->fd.GetNumber()); + ASSERT_EQ(6U, compaction->input(1, 0)->fd.GetNumber()); + ASSERT_EQ(7U, compaction->input(1, 1)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, OverlappingUserKeys4) { + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.max_bytes_for_level_base = 1000000; + + Add(1, 1U, "100", "150", 1U); + Add(1, 2U, "150", "199", 1U, 0, 0); + Add(1, 3U, "200", "250", 1100000U, 0, 0); + Add(1, 4U, "251", "300", 1U, 0, 0); + Add(1, 5U, "300", "350", 1U, 0, 0); + + Add(2, 6U, "100", "115", 1U); + Add(2, 7U, "125", "325", 1U); + Add(2, 8U, "350", "400", 1U); + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(2U, compaction->num_input_levels()); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(1U, compaction->num_input_files(1)); + ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(7U, compaction->input(1, 0)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, OverlappingUserKeys5) { + NewVersionStorage(6, kCompactionStyleLevel); + // Overlapping user keys on same level and output level + Add(1, 1U, "200", "400", 1000000000U); + Add(1, 2U, "400", "500", 1U, 0, 0); + Add(2, 3U, "000", "100", 1U); + Add(2, 4U, "100", "600", 1U, 0, 0); + Add(2, 5U, "600", "700", 1U, 0, 0); + + vstorage_->LevelFiles(2)[2]->being_compacted = true; + + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() == nullptr); +} + +TEST_F(CompactionPickerTest, OverlappingUserKeys6) { + NewVersionStorage(6, kCompactionStyleLevel); + // Overlapping user keys on same level and output level + Add(1, 1U, "200", "400", 1U, 0, 0); + Add(1, 2U, "401", "500", 1U, 0, 0); + Add(2, 3U, "000", "100", 1U); + Add(2, 4U, "100", "300", 1U, 0, 0); + Add(2, 5U, "305", "450", 1U, 0, 0); + Add(2, 6U, "460", "600", 1U, 0, 0); + Add(2, 7U, "600", "700", 1U, 0, 0); + + vstorage_->LevelFiles(1)[0]->marked_for_compaction = true; + vstorage_->LevelFiles(1)[1]->marked_for_compaction = true; + + 
UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(2U, compaction->num_input_levels()); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(3U, compaction->num_input_files(1)); +} + +TEST_F(CompactionPickerTest, OverlappingUserKeys7) { + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.max_compaction_bytes = 100000000000u; + // Overlapping user keys on same level and output level + Add(1, 1U, "200", "400", 1U, 0, 0); + Add(1, 2U, "401", "500", 1000000000U, 0, 0); + Add(2, 3U, "100", "250", 1U); + Add(2, 4U, "300", "600", 1U, 0, 0); + Add(2, 5U, "600", "800", 1U, 0, 0); + + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(2U, compaction->num_input_levels()); + ASSERT_GE(1U, compaction->num_input_files(0)); + ASSERT_GE(2U, compaction->num_input_files(1)); + // File 5 has to be included in the compaction + ASSERT_EQ(5U, compaction->inputs(1)->back()->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, OverlappingUserKeys8) { + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.max_compaction_bytes = 100000000000u; + // grow the number of inputs in "level" without + // changing the number of "level+1" files we pick up + // Expand input level as much as possible + // no overlapping case + Add(1, 1U, "101", "150", 1U); + Add(1, 2U, "151", "200", 1U); + Add(1, 3U, "201", "300", 1000000000U); + Add(1, 4U, "301", "400", 1U); + Add(1, 5U, "401", "500", 1U); + Add(2, 6U, "150", "200", 1U); + Add(2, 7U, "200", "450", 1U, 0, 0); + Add(2, 8U, "500", "600", 1U); + + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(2U, compaction->num_input_levels()); + ASSERT_EQ(3U, compaction->num_input_files(0)); + ASSERT_EQ(2U, compaction->num_input_files(1)); + ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(3U, compaction->input(0, 1)->fd.GetNumber()); + ASSERT_EQ(4U, compaction->input(0, 2)->fd.GetNumber()); + ASSERT_EQ(6U, compaction->input(1, 0)->fd.GetNumber()); + ASSERT_EQ(7U, compaction->input(1, 1)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, OverlappingUserKeys9) { + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.max_compaction_bytes = 100000000000u; + // grow the number of inputs in "level" without + // changing the number of "level+1" files we pick up + // Expand input level as much as possible + // overlapping case + Add(1, 1U, "121", "150", 1U); + Add(1, 2U, "151", "200", 1U); + Add(1, 3U, "201", "300", 1000000000U); + Add(1, 4U, "301", "400", 1U); + Add(1, 5U, "401", "500", 1U); + Add(2, 6U, "100", "120", 1U); + Add(2, 7U, "150", "200", 1U); + Add(2, 8U, "200", "450", 1U, 0, 0); + Add(2, 9U, "501", "600", 1U); + + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(2U, compaction->num_input_levels()); + 
ASSERT_EQ(5U, compaction->num_input_files(0)); + ASSERT_EQ(2U, compaction->num_input_files(1)); + ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber()); + ASSERT_EQ(3U, compaction->input(0, 2)->fd.GetNumber()); + ASSERT_EQ(4U, compaction->input(0, 3)->fd.GetNumber()); + ASSERT_EQ(7U, compaction->input(1, 0)->fd.GetNumber()); + ASSERT_EQ(8U, compaction->input(1, 1)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, OverlappingUserKeys10) { + // Locked file encountered when pulling in extra input-level files with same + // user keys. Verify we pick the next-best file from the same input level. + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.max_compaction_bytes = 100000000000u; + + // file_number 2U is largest and thus first choice. But it overlaps with + // file_number 1U which is being compacted. So instead we pick the next- + // biggest file, 3U, which is eligible for compaction. + Add(1 /* level */, 1U /* file_number */, "100" /* smallest */, + "150" /* largest */, 1U /* file_size */); + file_map_[1U].first->being_compacted = true; + Add(1 /* level */, 2U /* file_number */, "150" /* smallest */, + "200" /* largest */, 1000000000U /* file_size */, 0 /* smallest_seq */, + 0 /* largest_seq */); + Add(1 /* level */, 3U /* file_number */, "201" /* smallest */, + "250" /* largest */, 900000000U /* file_size */); + Add(2 /* level */, 4U /* file_number */, "100" /* smallest */, + "150" /* largest */, 1U /* file_size */); + Add(2 /* level */, 5U /* file_number */, "151" /* smallest */, + "200" /* largest */, 1U /* file_size */); + Add(2 /* level */, 6U /* file_number */, "201" /* smallest */, + "250" /* largest */, 1U /* file_size */); + + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(2U, compaction->num_input_levels()); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(1U, compaction->num_input_files(1)); + ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(6U, compaction->input(1, 0)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, OverlappingUserKeys11) { + // Locked file encountered when pulling in extra output-level files with same + // user keys. Expected to skip that compaction and pick the next-best choice. + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.max_compaction_bytes = 100000000000u; + + // score(L1) = 3.7 + // score(L2) = 1.85 + // There is no eligible file in L1 to compact since both candidates pull in + // file_number 5U, which overlaps with a file pending compaction (6U). The + // first eligible compaction is from L2->L3. 
+ Add(1 /* level */, 2U /* file_number */, "151" /* smallest */, + "200" /* largest */, 1000000000U /* file_size */); + Add(1 /* level */, 3U /* file_number */, "201" /* smallest */, + "250" /* largest */, 1U /* file_size */); + Add(2 /* level */, 4U /* file_number */, "100" /* smallest */, + "149" /* largest */, 5000000000U /* file_size */); + Add(2 /* level */, 5U /* file_number */, "150" /* smallest */, + "201" /* largest */, 1U /* file_size */); + Add(2 /* level */, 6U /* file_number */, "201" /* smallest */, + "249" /* largest */, 1U /* file_size */, 0 /* smallest_seq */, + 0 /* largest_seq */); + file_map_[6U].first->being_compacted = true; + Add(3 /* level */, 7U /* file_number */, "100" /* smallest */, + "149" /* largest */, 1U /* file_size */); + + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(2U, compaction->num_input_levels()); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(1U, compaction->num_input_files(1)); + ASSERT_EQ(4U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(7U, compaction->input(1, 0)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, FileTtlBooster) { + // Set TTL to 2048 + // TTL boosting for all levels starts at 1024, + // Whole TTL range is 2048 * 31 / 32 - 1024 = 1984 - 1024 = 960. + // From second last level (L5), range starts at + // 1024 + 480, 1024 + 240, 1024 + 120 (which is L3). + // Boosting step 124 / 16 = 7.75 -> 7 + // + const uint64_t kCurrentTime = 1000000; + FileMetaData meta; + + { + FileTtlBooster booster(kCurrentTime, 2048, 7, 3); + + // Not triggering if the file is younger than ttl/2 + meta.oldest_ancester_time = kCurrentTime - 1023; + ASSERT_EQ(1, booster.GetBoostScore(&meta)); + meta.oldest_ancester_time = kCurrentTime - 1024; + ASSERT_EQ(1, booster.GetBoostScore(&meta)); + meta.oldest_ancester_time = kCurrentTime + 10; + ASSERT_EQ(1, booster.GetBoostScore(&meta)); + + // Within one boosting step + meta.oldest_ancester_time = kCurrentTime - (1024 + 120 + 6); + ASSERT_EQ(1, booster.GetBoostScore(&meta)); + + // One boosting step + meta.oldest_ancester_time = kCurrentTime - (1024 + 120 + 7); + ASSERT_EQ(2, booster.GetBoostScore(&meta)); + meta.oldest_ancester_time = kCurrentTime - (1024 + 120 + 8); + ASSERT_EQ(2, booster.GetBoostScore(&meta)); + + // Multiple boosting steps + meta.oldest_ancester_time = kCurrentTime - (1024 + 120 + 30); + ASSERT_EQ(5, booster.GetBoostScore(&meta)); + + // Very high boosting steps + meta.oldest_ancester_time = kCurrentTime - (1024 + 120 + 700); + ASSERT_EQ(101, booster.GetBoostScore(&meta)); + } + { + // Test second last level + FileTtlBooster booster(kCurrentTime, 2048, 7, 5); + meta.oldest_ancester_time = kCurrentTime - (1024 + 480); + ASSERT_EQ(1, booster.GetBoostScore(&meta)); + meta.oldest_ancester_time = kCurrentTime - (1024 + 480 + 60); + ASSERT_EQ(3, booster.GetBoostScore(&meta)); + } + { + // Test last level + FileTtlBooster booster(kCurrentTime, 2048, 7, 6); + meta.oldest_ancester_time = kCurrentTime - (1024 + 480); + ASSERT_EQ(1, booster.GetBoostScore(&meta)); + meta.oldest_ancester_time = kCurrentTime - (1024 + 480 + 60); + ASSERT_EQ(1, booster.GetBoostScore(&meta)); + meta.oldest_ancester_time = kCurrentTime - 3000; + ASSERT_EQ(1, booster.GetBoostScore(&meta)); + } +} + +TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri1) { + NewVersionStorage(6, 
kCompactionStyleLevel); + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + mutable_cf_options_.max_bytes_for_level_base = 900000000U; + + // 6 L0 files, score 3. + Add(0, 1U, "000", "400", 1U); + Add(0, 2U, "001", "400", 1U, 0, 0); + Add(0, 3U, "001", "400", 1000000000U, 0, 0); + Add(0, 31U, "001", "400", 1000000000U, 0, 0); + Add(0, 32U, "001", "400", 1000000000U, 0, 0); + Add(0, 33U, "001", "400", 1000000000U, 0, 0); + + // L1 total size 2GB, score 2.2. If one file being compacted, score 1.1. + Add(1, 4U, "050", "300", 1000000000U, 0, 0); + file_map_[4u].first->being_compacted = true; + Add(1, 5U, "301", "350", 1000000000U, 0, 0); + + // Output level overlaps with the beginning and the end of the chain + Add(2, 6U, "050", "100", 1U); + Add(2, 7U, "300", "400", 1U); + + // No compaction should be scheduled, if L0 has higher priority than L1 + // but L0->L1 compaction is blocked by a file in L1 being compacted. + UpdateVersionStorageInfo(); + ASSERT_EQ(0, vstorage_->CompactionScoreLevel(0)); + ASSERT_EQ(1, vstorage_->CompactionScoreLevel(1)); + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() == nullptr); +} + +TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri2) { + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + mutable_cf_options_.max_bytes_for_level_base = 900000000U; + + // 6 L0 files, score 3. + Add(0, 1U, "000", "400", 1U); + Add(0, 2U, "001", "400", 1U, 0, 0); + Add(0, 3U, "001", "400", 1000000000U, 0, 0); + Add(0, 31U, "001", "400", 1000000000U, 0, 0); + Add(0, 32U, "001", "400", 1000000000U, 0, 0); + Add(0, 33U, "001", "400", 1000000000U, 0, 0); + + // L1 total size 2GB, score 2.2. If one file being compacted, score 1.1. + Add(1, 4U, "050", "300", 1000000000U, 0, 0); + Add(1, 5U, "301", "350", 1000000000U, 0, 0); + + // Output level overlaps with the beginning and the end of the chain + Add(2, 6U, "050", "100", 1U); + Add(2, 7U, "300", "400", 1U); + + // If no file in L1 being compacted, L0->L1 compaction will be scheduled. + UpdateVersionStorageInfo(); // being_compacted flag is cleared here. + ASSERT_EQ(0, vstorage_->CompactionScoreLevel(0)); + ASSERT_EQ(1, vstorage_->CompactionScoreLevel(1)); + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); +} + +TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri3) { + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + mutable_cf_options_.max_bytes_for_level_base = 900000000U; + + // 6 L0 files, score 3. + Add(0, 1U, "000", "400", 1U); + Add(0, 2U, "001", "400", 1U, 0, 0); + Add(0, 3U, "001", "400", 1000000000U, 0, 0); + Add(0, 31U, "001", "400", 1000000000U, 0, 0); + Add(0, 32U, "001", "400", 1000000000U, 0, 0); + Add(0, 33U, "001", "400", 1000000000U, 0, 0); + + // L1 score more than 6. 
+ Add(1, 4U, "050", "300", 1000000000U, 0, 0); + file_map_[4u].first->being_compacted = true; + Add(1, 5U, "301", "350", 1000000000U, 0, 0); + Add(1, 51U, "351", "400", 6000000000U, 0, 0); + + // Output level overlaps with the beginning and the end of the chain + Add(2, 6U, "050", "100", 1U); + Add(2, 7U, "300", "400", 1U); + + // If score in L1 is larger than L0, L1 compaction goes through despite + // there is pending L0 compaction. + UpdateVersionStorageInfo(); + ASSERT_EQ(1, vstorage_->CompactionScoreLevel(0)); + ASSERT_EQ(0, vstorage_->CompactionScoreLevel(1)); + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); +} + +TEST_F(CompactionPickerTest, EstimateCompactionBytesNeeded1) { + int num_levels = ioptions_.num_levels; + ioptions_.level_compaction_dynamic_level_bytes = false; + mutable_cf_options_.level0_file_num_compaction_trigger = 4; + mutable_cf_options_.max_bytes_for_level_base = 1000; + mutable_cf_options_.max_bytes_for_level_multiplier = 10; + NewVersionStorage(num_levels, kCompactionStyleLevel); + Add(0, 1U, "150", "200", 200); + Add(0, 2U, "150", "200", 200); + Add(0, 3U, "150", "200", 200); + // Level 1 is over target by 200 + Add(1, 4U, "400", "500", 600); + Add(1, 5U, "600", "700", 600); + // Level 2 is less than target 10000 even added size of level 1 + // Size ratio of L2/L1 is 9600 / 1200 = 8 + Add(2, 6U, "150", "200", 2500); + Add(2, 7U, "201", "210", 2000); + Add(2, 8U, "300", "310", 2600); + Add(2, 9U, "400", "500", 2500); + // Level 3 exceeds target 100,000 of 1000 + Add(3, 10U, "400", "500", 101000); + // Level 4 exceeds target 1,000,000 by 900 after adding size from level 3 + // Size ratio L4/L3 is 9.9 + // After merge from L3, L4 size is 1000900 + Add(4, 11U, "400", "500", 999900); + Add(5, 12U, "400", "500", 8007200); + + UpdateVersionStorageInfo(); + + ASSERT_EQ(200u * 9u + 10900u + 900u * 9, + vstorage_->estimated_compaction_needed_bytes()); +} + +TEST_F(CompactionPickerTest, EstimateCompactionBytesNeeded2) { + int num_levels = ioptions_.num_levels; + ioptions_.level_compaction_dynamic_level_bytes = false; + mutable_cf_options_.level0_file_num_compaction_trigger = 3; + mutable_cf_options_.max_bytes_for_level_base = 1000; + mutable_cf_options_.max_bytes_for_level_multiplier = 10; + NewVersionStorage(num_levels, kCompactionStyleLevel); + Add(0, 1U, "150", "200", 200); + Add(0, 2U, "150", "200", 200); + Add(0, 4U, "150", "200", 200); + Add(0, 5U, "150", "200", 200); + Add(0, 6U, "150", "200", 200); + // Level 1 size will be 1400 after merging with L0 + Add(1, 7U, "400", "500", 200); + Add(1, 8U, "600", "700", 200); + // Level 2 is less than target 10000 even added size of level 1 + Add(2, 9U, "150", "200", 9100); + // Level 3 over the target, but since level 4 is empty, we assume it will be + // a trivial move. 
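+  // (Under that assumption the L3 file adds nothing to the estimate; only the
+  //  1400-byte L0->L1 merge plus the L1->L2 merge computed below are counted.)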
+ Add(3, 10U, "400", "500", 101000); + + UpdateVersionStorageInfo(); + + // estimated L1->L2 merge: 400 * (9100.0 / 1400.0 + 1.0) + ASSERT_EQ(1400u + 3000u, vstorage_->estimated_compaction_needed_bytes()); +} + +TEST_F(CompactionPickerTest, EstimateCompactionBytesNeeded3) { + int num_levels = ioptions_.num_levels; + ioptions_.level_compaction_dynamic_level_bytes = false; + mutable_cf_options_.level0_file_num_compaction_trigger = 3; + mutable_cf_options_.max_bytes_for_level_base = 1000; + mutable_cf_options_.max_bytes_for_level_multiplier = 10; + NewVersionStorage(num_levels, kCompactionStyleLevel); + Add(0, 1U, "150", "200", 2000); + Add(0, 2U, "150", "200", 2000); + Add(0, 4U, "150", "200", 2000); + Add(0, 5U, "150", "200", 2000); + Add(0, 6U, "150", "200", 1000); + // Level 1 size will be 10000 after merging with L0 + Add(1, 7U, "400", "500", 500); + Add(1, 8U, "600", "700", 500); + + Add(2, 9U, "150", "200", 10000); + + UpdateVersionStorageInfo(); + + ASSERT_EQ(10000u + 18000u, vstorage_->estimated_compaction_needed_bytes()); +} + +TEST_F(CompactionPickerTest, EstimateCompactionBytesNeededDynamicLevel) { + int num_levels = ioptions_.num_levels; + ioptions_.level_compaction_dynamic_level_bytes = true; + mutable_cf_options_.level0_file_num_compaction_trigger = 3; + mutable_cf_options_.max_bytes_for_level_base = 1000; + mutable_cf_options_.max_bytes_for_level_multiplier = 10; + NewVersionStorage(num_levels, kCompactionStyleLevel); + + // Set Last level size 50000 + // num_levels - 1 target 5000 + // num_levels - 2 is base level with target 1000 (rounded up to + // max_bytes_for_level_base). + Add(num_levels - 1, 10U, "400", "500", 50000); + + Add(0, 1U, "150", "200", 200); + Add(0, 2U, "150", "200", 200); + Add(0, 4U, "150", "200", 200); + Add(0, 5U, "150", "200", 200); + Add(0, 6U, "150", "200", 200); + // num_levels - 3 is over target by 100 + 1000 + Add(num_levels - 3, 7U, "400", "500", 550); + Add(num_levels - 3, 8U, "600", "700", 550); + // num_levels - 2 is over target by 1100 + 200 + Add(num_levels - 2, 9U, "150", "200", 5200); + + UpdateVersionStorageInfo(); + + // Merging to the second last level: (5200 / 2100 + 1) * 1100 + // Merging to the last level: (50000 / 6300 + 1) * 1300 + ASSERT_EQ(2100u + 3823u + 11617u, + vstorage_->estimated_compaction_needed_bytes()); +} + +TEST_F(CompactionPickerTest, IsBottommostLevelTest) { + // case 1: Higher levels are empty + NewVersionStorage(6, kCompactionStyleLevel); + Add(0, 1U, "a", "m"); + Add(0, 2U, "c", "z"); + Add(1, 3U, "d", "e"); + Add(1, 4U, "l", "p"); + Add(2, 5U, "g", "i"); + Add(2, 6U, "x", "z"); + UpdateVersionStorageInfo(); + SetCompactionInputFilesLevels(2, 1); + AddToCompactionFiles(3U); + AddToCompactionFiles(5U); + bool result = + Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_); + ASSERT_TRUE(result); + + // case 2: Higher levels have no overlap + NewVersionStorage(6, kCompactionStyleLevel); + Add(0, 1U, "a", "m"); + Add(0, 2U, "c", "z"); + Add(1, 3U, "d", "e"); + Add(1, 4U, "l", "p"); + Add(2, 5U, "g", "i"); + Add(2, 6U, "x", "z"); + Add(3, 7U, "k", "p"); + Add(3, 8U, "t", "w"); + Add(4, 9U, "a", "b"); + Add(5, 10U, "c", "cc"); + UpdateVersionStorageInfo(); + SetCompactionInputFilesLevels(2, 1); + AddToCompactionFiles(3U); + AddToCompactionFiles(5U); + result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_); + ASSERT_TRUE(result); + + // case 3.1: Higher levels (level 3) have overlap + NewVersionStorage(6, kCompactionStyleLevel); + Add(0, 1U, "a", "m"); + Add(0, 2U, "c", "z"); + 
Add(1, 3U, "d", "e"); + Add(1, 4U, "l", "p"); + Add(2, 5U, "g", "i"); + Add(2, 6U, "x", "z"); + Add(3, 7U, "e", "g"); + Add(3, 8U, "h", "k"); + Add(4, 9U, "a", "b"); + Add(5, 10U, "c", "cc"); + UpdateVersionStorageInfo(); + SetCompactionInputFilesLevels(2, 1); + AddToCompactionFiles(3U); + AddToCompactionFiles(5U); + result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_); + ASSERT_FALSE(result); + + // case 3.2: Higher levels (level 5) have overlap + DeleteVersionStorage(); + NewVersionStorage(6, kCompactionStyleLevel); + Add(0, 1U, "a", "m"); + Add(0, 2U, "c", "z"); + Add(1, 3U, "d", "e"); + Add(1, 4U, "l", "p"); + Add(2, 5U, "g", "i"); + Add(2, 6U, "x", "z"); + Add(3, 7U, "j", "k"); + Add(3, 8U, "l", "m"); + Add(4, 9U, "a", "b"); + Add(5, 10U, "c", "cc"); + Add(5, 11U, "h", "k"); + Add(5, 12U, "y", "yy"); + Add(5, 13U, "z", "zz"); + UpdateVersionStorageInfo(); + SetCompactionInputFilesLevels(2, 1); + AddToCompactionFiles(3U); + AddToCompactionFiles(5U); + result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_); + ASSERT_FALSE(result); + + // case 3.3: Higher levels (level 5) have overlap, but it's only overlapping + // one key ("d") + NewVersionStorage(6, kCompactionStyleLevel); + Add(0, 1U, "a", "m"); + Add(0, 2U, "c", "z"); + Add(1, 3U, "d", "e"); + Add(1, 4U, "l", "p"); + Add(2, 5U, "g", "i"); + Add(2, 6U, "x", "z"); + Add(3, 7U, "j", "k"); + Add(3, 8U, "l", "m"); + Add(4, 9U, "a", "b"); + Add(5, 10U, "c", "cc"); + Add(5, 11U, "ccc", "d"); + Add(5, 12U, "y", "yy"); + Add(5, 13U, "z", "zz"); + UpdateVersionStorageInfo(); + SetCompactionInputFilesLevels(2, 1); + AddToCompactionFiles(3U); + AddToCompactionFiles(5U); + result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_); + ASSERT_FALSE(result); + + // Level 0 files overlap + NewVersionStorage(6, kCompactionStyleLevel); + Add(0, 1U, "s", "t"); + Add(0, 2U, "a", "m"); + Add(0, 3U, "b", "z"); + Add(0, 4U, "e", "f"); + Add(5, 10U, "y", "z"); + UpdateVersionStorageInfo(); + SetCompactionInputFilesLevels(1, 0); + AddToCompactionFiles(1U); + AddToCompactionFiles(2U); + AddToCompactionFiles(3U); + AddToCompactionFiles(4U); + result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_); + ASSERT_FALSE(result); + + // Level 0 files don't overlap + NewVersionStorage(6, kCompactionStyleLevel); + Add(0, 1U, "s", "t"); + Add(0, 2U, "a", "m"); + Add(0, 3U, "b", "k"); + Add(0, 4U, "e", "f"); + Add(5, 10U, "y", "z"); + UpdateVersionStorageInfo(); + SetCompactionInputFilesLevels(1, 0); + AddToCompactionFiles(1U); + AddToCompactionFiles(2U); + AddToCompactionFiles(3U); + AddToCompactionFiles(4U); + result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_); + ASSERT_TRUE(result); + + // Level 1 files overlap + NewVersionStorage(6, kCompactionStyleLevel); + Add(0, 1U, "s", "t"); + Add(0, 2U, "a", "m"); + Add(0, 3U, "b", "k"); + Add(0, 4U, "e", "f"); + Add(1, 5U, "a", "m"); + Add(1, 6U, "n", "o"); + Add(1, 7U, "w", "y"); + Add(5, 10U, "y", "z"); + UpdateVersionStorageInfo(); + SetCompactionInputFilesLevels(2, 0); + AddToCompactionFiles(1U); + AddToCompactionFiles(2U); + AddToCompactionFiles(3U); + AddToCompactionFiles(4U); + AddToCompactionFiles(5U); + AddToCompactionFiles(6U); + AddToCompactionFiles(7U); + result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_); + ASSERT_FALSE(result); + + DeleteVersionStorage(); +} + +TEST_F(CompactionPickerTest, MaxCompactionBytesHit) { + mutable_cf_options_.max_bytes_for_level_base = 1000000u; + 
mutable_cf_options_.max_compaction_bytes = 800000u; + mutable_cf_options_.ignore_max_compaction_bytes_for_input = false; + ioptions_.level_compaction_dynamic_level_bytes = false; + NewVersionStorage(6, kCompactionStyleLevel); + // A compaction should be triggered and pick file 2 and 5. + // It cannot expand, because adding files 1 and 3 would make the compaction + // size exceed mutable_cf_options_.max_compaction_bytes. + Add(1, 1U, "100", "150", 300000U); + Add(1, 2U, "151", "200", 300001U, 0, 0); + Add(1, 3U, "201", "250", 300000U, 0, 0); + Add(1, 4U, "251", "300", 300000U, 0, 0); + Add(2, 5U, "100", "256", 1U); + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(2U, compaction->num_input_levels()); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(1U, compaction->num_input_files(1)); + ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(5U, compaction->input(1, 0)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, MaxCompactionBytesNotHit) { + mutable_cf_options_.max_bytes_for_level_base = 800000u; + mutable_cf_options_.max_compaction_bytes = 1000000u; + mutable_cf_options_.ignore_max_compaction_bytes_for_input = false; + ioptions_.level_compaction_dynamic_level_bytes = false; + NewVersionStorage(6, kCompactionStyleLevel); + // A compaction should be triggered and pick file 2 and 5, + // and it expands to files 1 and 3 too. + Add(1, 1U, "100", "150", 300000U); + Add(1, 2U, "151", "200", 300001U, 0, 0); + Add(1, 3U, "201", "250", 300000U, 0, 0); + Add(1, 4U, "251", "300", 300000U, 0, 0); + Add(2, 5U, "000", "251", 1U); + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(2U, compaction->num_input_levels()); + ASSERT_EQ(3U, compaction->num_input_files(0)); + ASSERT_EQ(1U, compaction->num_input_files(1)); + ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber()); + ASSERT_EQ(3U, compaction->input(0, 2)->fd.GetNumber()); + ASSERT_EQ(5U, compaction->input(1, 0)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, IsTrivialMoveOn) { + mutable_cf_options_.max_bytes_for_level_base = 10000u; + mutable_cf_options_.max_compaction_bytes = 10001u; + ioptions_.level_compaction_dynamic_level_bytes = false; + NewVersionStorage(6, kCompactionStyleLevel); + // A compaction should be triggered and pick file 2 + Add(1, 1U, "100", "150", 3000U); + Add(1, 2U, "151", "200", 3001U); + Add(1, 3U, "201", "250", 3000U); + Add(1, 4U, "251", "300", 3000U); + + Add(3, 5U, "120", "130", 7000U); + Add(3, 6U, "170", "180", 7000U); + Add(3, 7U, "220", "230", 7000U); + Add(3, 8U, "270", "280", 7000U); + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_TRUE(compaction->IsTrivialMove()); +} + +TEST_F(CompactionPickerTest, L0TrivialMove1) { + mutable_cf_options_.max_bytes_for_level_base = 10000000u; + mutable_cf_options_.level0_file_num_compaction_trigger = 4; + mutable_cf_options_.max_compaction_bytes = 10000000u; +
ioptions_.level_compaction_dynamic_level_bytes = false; + NewVersionStorage(6, kCompactionStyleLevel); + + Add(0, 1U, "100", "150", 3000U, 0, 710, 800); + Add(0, 2U, "151", "200", 3001U, 0, 610, 700); + Add(0, 3U, "301", "350", 3000U, 0, 510, 600); + Add(0, 4U, "451", "400", 3000U, 0, 410, 500); + + Add(1, 5U, "120", "130", 7000U); + Add(1, 6U, "170", "180", 7000U); + Add(1, 7U, "220", "230", 7000U); + Add(1, 8U, "270", "280", 7000U); + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1, compaction->num_input_levels()); + ASSERT_EQ(2, compaction->num_input_files(0)); + ASSERT_EQ(3, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(4, compaction->input(0, 1)->fd.GetNumber()); + ASSERT_TRUE(compaction->IsTrivialMove()); +} + +TEST_F(CompactionPickerTest, L0TrivialMoveOneFile) { + mutable_cf_options_.max_bytes_for_level_base = 10000000u; + mutable_cf_options_.level0_file_num_compaction_trigger = 4; + mutable_cf_options_.max_compaction_bytes = 10000000u; + ioptions_.level_compaction_dynamic_level_bytes = false; + NewVersionStorage(6, kCompactionStyleLevel); + + Add(0, 1U, "100", "150", 3000U, 0, 710, 800); + Add(0, 2U, "551", "600", 3001U, 0, 610, 700); + Add(0, 3U, "101", "150", 3000U, 0, 510, 600); + Add(0, 4U, "451", "400", 3000U, 0, 410, 500); + + Add(1, 5U, "120", "130", 7000U); + Add(1, 6U, "170", "180", 7000U); + Add(1, 7U, "220", "230", 7000U); + Add(1, 8U, "270", "280", 7000U); + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1, compaction->num_input_levels()); + ASSERT_EQ(1, compaction->num_input_files(0)); + ASSERT_EQ(4, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_TRUE(compaction->IsTrivialMove()); +} + +TEST_F(CompactionPickerTest, L0TrivialMoveWholeL0) { + mutable_cf_options_.max_bytes_for_level_base = 10000000u; + mutable_cf_options_.level0_file_num_compaction_trigger = 4; + mutable_cf_options_.max_compaction_bytes = 10000000u; + ioptions_.level_compaction_dynamic_level_bytes = false; + NewVersionStorage(6, kCompactionStyleLevel); + + Add(0, 1U, "300", "350", 3000U, 0, 710, 800); + Add(0, 2U, "651", "600", 3001U, 0, 610, 700); + Add(0, 3U, "501", "550", 3000U, 0, 510, 600); + Add(0, 4U, "451", "400", 3000U, 0, 410, 500); + + Add(1, 5U, "120", "130", 7000U); + Add(1, 6U, "970", "980", 7000U); + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1, compaction->num_input_levels()); + ASSERT_EQ(4, compaction->num_input_files(0)); + ASSERT_EQ(1, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(4, compaction->input(0, 1)->fd.GetNumber()); + ASSERT_EQ(3, compaction->input(0, 2)->fd.GetNumber()); + ASSERT_EQ(2, compaction->input(0, 3)->fd.GetNumber()); + ASSERT_TRUE(compaction->IsTrivialMove()); +} + +TEST_F(CompactionPickerTest, IsTrivialMoveOffSstPartitioned) { + mutable_cf_options_.max_bytes_for_level_base = 10000u; + mutable_cf_options_.max_compaction_bytes = 10001u; + ioptions_.level_compaction_dynamic_level_bytes = false; + ioptions_.sst_partitioner_factory 
= NewSstPartitionerFixedPrefixFactory(1); + NewVersionStorage(6, kCompactionStyleLevel); + // A compaction should be triggered and pick file 2 + Add(1, 1U, "100", "150", 3000U); + Add(1, 2U, "151", "200", 3001U); + Add(1, 3U, "201", "250", 3000U); + Add(1, 4U, "251", "300", 3000U); + + Add(3, 5U, "120", "130", 7000U); + Add(3, 6U, "170", "180", 7000U); + Add(3, 7U, "220", "230", 7000U); + Add(3, 8U, "270", "280", 7000U); + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + // No trivial move, because partitioning is applied + ASSERT_TRUE(!compaction->IsTrivialMove()); +} + +TEST_F(CompactionPickerTest, IsTrivialMoveOff) { + mutable_cf_options_.max_bytes_for_level_base = 1000000u; + mutable_cf_options_.max_compaction_bytes = 10000u; + ioptions_.level_compaction_dynamic_level_bytes = false; + NewVersionStorage(6, kCompactionStyleLevel); + // A compaction should be triggered and pick all files from level 1 + Add(1, 1U, "100", "150", 300000U, 0, 0); + Add(1, 2U, "150", "200", 300000U, 0, 0); + Add(1, 3U, "200", "250", 300000U, 0, 0); + Add(1, 4U, "250", "300", 300000U, 0, 0); + + Add(3, 5U, "120", "130", 6000U); + Add(3, 6U, "140", "150", 6000U); + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_FALSE(compaction->IsTrivialMove()); +} + +TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles1) { + mutable_cf_options_.max_bytes_for_level_base = 1000u; + mutable_cf_options_.max_compaction_bytes = 10000001u; + ioptions_.level_compaction_dynamic_level_bytes = false; + ioptions_.compaction_pri = kMinOverlappingRatio; + NewVersionStorage(6, kCompactionStyleLevel); + + Add(2, 1U, "100", "150", 3000U); + Add(2, 2U, "151", "200", 3001U); + Add(2, 3U, "301", "350", 3000U); + Add(2, 4U, "451", "400", 3000U); + Add(2, 5U, "551", "500", 3000U); + Add(2, 6U, "651", "600", 3000U); + Add(2, 7U, "751", "700", 3000U); + Add(2, 8U, "851", "900", 3000U); + + Add(3, 15U, "120", "130", 700U); + Add(3, 16U, "170", "180", 700U); + Add(3, 17U, "220", "230", 700U); + Add(3, 18U, "870", "880", 700U); + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_TRUE(compaction->IsTrivialMove()); + ASSERT_EQ(1, compaction->num_input_levels()); + ASSERT_EQ(4, compaction->num_input_files(0)); + ASSERT_EQ(3, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(4, compaction->input(0, 1)->fd.GetNumber()); + ASSERT_EQ(5, compaction->input(0, 2)->fd.GetNumber()); + ASSERT_EQ(6, compaction->input(0, 3)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles2) { + mutable_cf_options_.max_bytes_for_level_base = 1000u; + mutable_cf_options_.max_compaction_bytes = 10000001u; + ioptions_.level_compaction_dynamic_level_bytes = false; + ioptions_.compaction_pri = kMinOverlappingRatio; + NewVersionStorage(6, kCompactionStyleLevel); + + Add(2, 1U, "100", "150", 3000U); + Add(2, 2U, "151", "160", 3001U); + Add(2, 3U, "161", "179", 3000U); + Add(2, 4U, "220", "400", 3000U); + Add(2, 5U, "551", "500", 3000U); + Add(2, 6U, "651", "600", 
3000U); + Add(2, 7U, "751", "700", 3000U); + Add(2, 8U, "851", "900", 3000U); + + Add(3, 15U, "120", "130", 700U); + Add(3, 17U, "220", "230", 700U); + Add(3, 18U, "870", "880", 700U); + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_TRUE(compaction->IsTrivialMove()); + ASSERT_EQ(1, compaction->num_input_levels()); + ASSERT_EQ(2, compaction->num_input_files(0)); + ASSERT_EQ(2, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(3, compaction->input(0, 1)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles3) { + mutable_cf_options_.max_bytes_for_level_base = 1000u; + mutable_cf_options_.max_compaction_bytes = 10000001u; + ioptions_.level_compaction_dynamic_level_bytes = false; + ioptions_.compaction_pri = kMinOverlappingRatio; + NewVersionStorage(6, kCompactionStyleLevel); + + // Even if consecutive files can be trivial moved, we don't pick them + // since in case trivial move can't be issued for a reason, we cannot + // fall back to normal compactions. + Add(2, 1U, "100", "150", 3000U); + Add(2, 2U, "151", "160", 3001U); + Add(2, 5U, "551", "500", 3000U); + Add(2, 6U, "651", "600", 3000U); + Add(2, 7U, "751", "700", 3000U); + Add(2, 8U, "851", "900", 3000U); + + Add(3, 15U, "120", "130", 700U); + Add(3, 17U, "220", "230", 700U); + Add(3, 18U, "870", "880", 700U); + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_TRUE(compaction->IsTrivialMove()); + ASSERT_EQ(1, compaction->num_input_levels()); + ASSERT_EQ(1, compaction->num_input_files(0)); + ASSERT_EQ(2, compaction->input(0, 0)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles4) { + mutable_cf_options_.max_bytes_for_level_base = 1000u; + mutable_cf_options_.max_compaction_bytes = 10000001u; + ioptions_.level_compaction_dynamic_level_bytes = false; + ioptions_.compaction_pri = kMinOverlappingRatio; + NewVersionStorage(6, kCompactionStyleLevel); + + Add(2, 1U, "100", "150", 4000U); + Add(2, 2U, "151", "160", 4001U); + Add(2, 3U, "161", "179", 4000U); + + Add(3, 15U, "120", "130", 700U); + Add(3, 17U, "220", "230", 700U); + Add(3, 18U, "870", "880", 700U); + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_TRUE(compaction->IsTrivialMove()); + ASSERT_EQ(1, compaction->num_input_levels()); + ASSERT_EQ(2, compaction->num_input_files(0)); + ASSERT_EQ(2, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(3, compaction->input(0, 1)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles5) { + mutable_cf_options_.max_bytes_for_level_base = 1000u; + mutable_cf_options_.max_compaction_bytes = 10000001u; + ioptions_.level_compaction_dynamic_level_bytes = false; + ioptions_.compaction_pri = kMinOverlappingRatio; + NewVersionStorage(6, kCompactionStyleLevel); + + // File 4 and 5 aren't clean cut, so only 2 and 3 are picked. 
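+  // (Files 4 and 5 below share the boundary user key "185", so the range
+  //  cannot be cut cleanly between them and the picked run stops after
+  //  file 3.)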
+ Add(2, 1U, "100", "150", 4000U); + Add(2, 2U, "151", "160", 4001U); + Add(2, 3U, "161", "179", 4000U); + Add(2, 4U, "180", "185", 4000U); + Add(2, 5U, "185", "190", 4000U); + + Add(3, 15U, "120", "130", 700U); + Add(3, 17U, "220", "230", 700U); + Add(3, 18U, "870", "880", 700U); + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_TRUE(compaction->IsTrivialMove()); + ASSERT_EQ(1, compaction->num_input_levels()); + ASSERT_EQ(2, compaction->num_input_files(0)); + ASSERT_EQ(2, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(3, compaction->input(0, 1)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles6) { + mutable_cf_options_.max_bytes_for_level_base = 1000u; + mutable_cf_options_.max_compaction_bytes = 10000001u; + ioptions_.level_compaction_dynamic_level_bytes = false; + ioptions_.compaction_pri = kMinOverlappingRatio; + NewVersionStorage(6, kCompactionStyleLevel); + + Add(2, 1U, "100", "150", 3000U); + Add(2, 2U, "151", "200", 3001U); + Add(2, 3U, "301", "350", 3000U); + Add(2, 4U, "451", "400", 3000U); + Add(2, 5U, "551", "500", 3000U); + file_map_[5U].first->being_compacted = true; + Add(2, 6U, "651", "600", 3000U); + Add(2, 7U, "751", "700", 3000U); + Add(2, 8U, "851", "900", 3000U); + + Add(3, 15U, "120", "130", 700U); + Add(3, 16U, "170", "180", 700U); + Add(3, 17U, "220", "230", 700U); + Add(3, 18U, "870", "880", 700U); + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_TRUE(compaction->IsTrivialMove()); + ASSERT_EQ(1, compaction->num_input_levels()); + // Since the next file is being compacted. Stopping at 3 and 4. 
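+  // (File 5 was marked being_compacted above, so the trivial-move run that
+  //  starts at file 3 cannot be extended past file 4.)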
+ ASSERT_EQ(2, compaction->num_input_files(0)); + ASSERT_EQ(3, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(4, compaction->input(0, 1)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, CacheNextCompactionIndex) { + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.max_compaction_bytes = 100000000000u; + + Add(1 /* level */, 1U /* file_number */, "100" /* smallest */, + "149" /* largest */, 1000000000U /* file_size */); + file_map_[1U].first->being_compacted = true; + Add(1 /* level */, 2U /* file_number */, "150" /* smallest */, + "199" /* largest */, 900000000U /* file_size */); + Add(1 /* level */, 3U /* file_number */, "200" /* smallest */, + "249" /* largest */, 800000000U /* file_size */); + Add(1 /* level */, 4U /* file_number */, "250" /* smallest */, + "299" /* largest */, 700000000U /* file_size */); + Add(2 /* level */, 5U /* file_number */, "150" /* smallest */, + "199" /* largest */, 100U /* file_size */); + Add(2 /* level */, 6U /* file_number */, "200" /* smallest */, + "240" /* largest */, 1U /* file_size */); + Add(2 /* level */, 7U /* file_number */, "260" /* smallest */, + "270" /* largest */, 1U /* file_size */); + file_map_[5U].first->being_compacted = true; + + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(2U, compaction->num_input_levels()); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(1U, compaction->num_input_files(1)); + ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(2, vstorage_->NextCompactionIndex(1 /* level */)); + + compaction.reset(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(2U, compaction->num_input_levels()); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(1U, compaction->num_input_files(1)); + ASSERT_EQ(4U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(3, vstorage_->NextCompactionIndex(1 /* level */)); + + compaction.reset(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() == nullptr); + ASSERT_EQ(4, vstorage_->NextCompactionIndex(1 /* level */)); +} + +TEST_F(CompactionPickerTest, IntraL0MaxCompactionBytesNotHit) { + // Intra L0 compaction triggers only if there are at least + // level0_file_num_compaction_trigger + 2 L0 files. + mutable_cf_options_.level0_file_num_compaction_trigger = 3; + mutable_cf_options_.max_compaction_bytes = 1000000u; + NewVersionStorage(6, kCompactionStyleLevel); + + // All 5 L0 files will be picked for intra L0 compaction. The one L1 file + // spans entire L0 key range and is marked as being compacted to avoid + // L0->L1 compaction. 
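+  // (With level0_file_num_compaction_trigger = 3, the 5 L0 files meet the
+  //  trigger + 2 minimum, and their total of 5 * 200000 bytes does not exceed
+  //  max_compaction_bytes = 1000000, so none of them are dropped.)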
+ Add(0, 1U, "100", "150", 200000U, 0, 100, 101); + Add(0, 2U, "151", "200", 200000U, 0, 102, 103); + Add(0, 3U, "201", "250", 200000U, 0, 104, 105); + Add(0, 4U, "251", "300", 200000U, 0, 106, 107); + Add(0, 5U, "301", "350", 200000U, 0, 108, 109); + Add(1, 6U, "100", "350", 200000U, 0, 110, 111); + vstorage_->LevelFiles(1)[0]->being_compacted = true; + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_levels()); + ASSERT_EQ(5U, compaction->num_input_files(0)); + ASSERT_EQ(CompactionReason::kLevelL0FilesNum, + compaction->compaction_reason()); + ASSERT_EQ(0, compaction->output_level()); +} + +TEST_F(CompactionPickerTest, IntraL0MaxCompactionBytesHit) { + // Intra L0 compaction triggers only if there are at least + // level0_file_num_compaction_trigger + 2 L0 files. + mutable_cf_options_.level0_file_num_compaction_trigger = 3; + mutable_cf_options_.max_compaction_bytes = 999999u; + NewVersionStorage(6, kCompactionStyleLevel); + + // 4 out of 5 L0 files will be picked for intra L0 compaction due to + // max_compaction_bytes limit (the minimum number of files for triggering + // intra L0 compaction is 4). The one L1 file spans entire L0 key range and + // is marked as being compacted to avoid L0->L1 compaction. + Add(0, 1U, "100", "150", 200000U, 0, 100, 101); + Add(0, 2U, "151", "200", 200000U, 0, 102, 103); + Add(0, 3U, "201", "250", 200000U, 0, 104, 105); + Add(0, 4U, "251", "300", 200000U, 0, 106, 107); + Add(0, 5U, "301", "350", 200000U, 0, 108, 109); + Add(1, 6U, "100", "350", 200000U, 0, 109, 110); + vstorage_->LevelFiles(1)[0]->being_compacted = true; + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_levels()); + ASSERT_EQ(4U, compaction->num_input_files(0)); + ASSERT_EQ(CompactionReason::kLevelL0FilesNum, + compaction->compaction_reason()); + ASSERT_EQ(0, compaction->output_level()); +} + +TEST_F(CompactionPickerTest, IntraL0ForEarliestSeqno) { + // Intra L0 compaction triggers only if there are at least + // level0_file_num_compaction_trigger + 2 L0 files. + mutable_cf_options_.level0_file_num_compaction_trigger = 3; + mutable_cf_options_.max_compaction_bytes = 999999u; + NewVersionStorage(6, kCompactionStyleLevel); + + // 4 out of 6 L0 files will be picked for intra L0 compaction due to + // being_compact limit. And the latest one L0 will be skipped due to earliest + // seqno. The one L1 file spans entire L0 key range and is marked as being + // compacted to avoid L0->L1 compaction. 
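+  // (PickCompaction below is called with 107 as the earliest-seqno bound, so
+  //  the newest L0 file, whose seqnos 108-109 are not older than that bound,
+  //  is the one left out.)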
+ Add(1, 1U, "100", "350", 200000U, 0, 110, 111); + Add(0, 2U, "301", "350", 1U, 0, 108, 109); + Add(0, 3U, "251", "300", 1U, 0, 106, 107); + Add(0, 4U, "201", "250", 1U, 0, 104, 105); + Add(0, 5U, "151", "200", 1U, 0, 102, 103); + Add(0, 6U, "100", "150", 1U, 0, 100, 101); + Add(0, 7U, "100", "100", 1U, 0, 99, 100); + vstorage_->LevelFiles(0)[5]->being_compacted = true; + vstorage_->LevelFiles(1)[0]->being_compacted = true; + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_, 107)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_levels()); + ASSERT_EQ(4U, compaction->num_input_files(0)); + ASSERT_EQ(CompactionReason::kLevelL0FilesNum, + compaction->compaction_reason()); + ASSERT_EQ(0, compaction->output_level()); +} + +#ifndef ROCKSDB_LITE +TEST_F(CompactionPickerTest, UniversalMarkedCompactionFullOverlap) { + const uint64_t kFileSize = 100000; + + ioptions_.compaction_style = kCompactionStyleUniversal; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + // This test covers the case where a "regular" universal compaction is + // scheduled first, followed by a delete triggered compaction. The latter + // should fail + NewVersionStorage(5, kCompactionStyleUniversal); + + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); + Add(0, 2U, "201", "250", 2 * kFileSize, 0, 401, 450); + Add(0, 4U, "260", "300", 4 * kFileSize, 0, 260, 300); + Add(3, 5U, "010", "080", 8 * kFileSize, 0, 200, 251); + Add(4, 3U, "301", "350", 8 * kFileSize, 0, 101, 150); + Add(4, 6U, "501", "750", 8 * kFileSize, 0, 101, 150); + + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + + ASSERT_TRUE(compaction); + // Validate that its a compaction to reduce sorted runs + ASSERT_EQ(CompactionReason::kUniversalSortedRunNum, + compaction->compaction_reason()); + ASSERT_EQ(0, compaction->output_level()); + ASSERT_EQ(0, compaction->start_level()); + ASSERT_EQ(2U, compaction->num_input_files(0)); + + AddVersionStorage(); + // Simulate a flush and mark the file for compaction + Add(0, 7U, "150", "200", kFileSize, 0, 551, 600, 0, true); + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction2( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_FALSE(compaction2); +} + +TEST_F(CompactionPickerTest, UniversalMarkedCompactionFullOverlap2) { + const uint64_t kFileSize = 100000; + + ioptions_.compaction_style = kCompactionStyleUniversal; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + // This test covers the case where a delete triggered compaction is + // scheduled first, followed by a "regular" compaction. 
The latter + // should fail + NewVersionStorage(5, kCompactionStyleUniversal); + + // Mark file number 4 for compaction + Add(0, 4U, "260", "300", 4 * kFileSize, 0, 260, 300, 0, true); + Add(3, 5U, "240", "290", 8 * kFileSize, 0, 201, 250); + Add(4, 3U, "301", "350", 8 * kFileSize, 0, 101, 150); + Add(4, 6U, "501", "750", 8 * kFileSize, 0, 101, 150); + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + + ASSERT_TRUE(compaction); + // Validate that its a delete triggered compaction + ASSERT_EQ(CompactionReason::kFilesMarkedForCompaction, + compaction->compaction_reason()); + ASSERT_EQ(3, compaction->output_level()); + ASSERT_EQ(0, compaction->start_level()); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(1U, compaction->num_input_files(1)); + + AddVersionStorage(); + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); + Add(0, 2U, "201", "250", 2 * kFileSize, 0, 401, 450); + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction2( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_FALSE(compaction2); +} + +TEST_F(CompactionPickerTest, UniversalMarkedCompactionStartOutputOverlap) { + // The case where universal periodic compaction can be picked + // with some newer files being compacted. + const uint64_t kFileSize = 100000; + + ioptions_.compaction_style = kCompactionStyleUniversal; + + bool input_level_overlap = false; + bool output_level_overlap = false; + // Let's mark 2 files in 2 different levels for compaction. The + // compaction picker will randomly pick one, so use the sync point to + // ensure a deterministic order. Loop until both cases are covered + size_t random_index = 0; + SyncPoint::GetInstance()->SetCallBack( + "CompactionPicker::PickFilesMarkedForCompaction", [&](void* arg) { + size_t* index = static_cast<size_t*>(arg); + *index = random_index; + }); + SyncPoint::GetInstance()->EnableProcessing(); + while (!input_level_overlap || !output_level_overlap) { + // Ensure that the L0 file gets picked first + random_index = !input_level_overlap ? 0 : 1; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + NewVersionStorage(5, kCompactionStyleUniversal); + + Add(0, 1U, "260", "300", 4 * kFileSize, 0, 260, 300, 0, true); + Add(3, 2U, "010", "020", 2 * kFileSize, 0, 201, 248); + Add(3, 3U, "250", "270", 2 * kFileSize, 0, 202, 249); + Add(3, 4U, "290", "310", 2 * kFileSize, 0, 203, 250); + Add(3, 5U, "310", "320", 2 * kFileSize, 0, 204, 251, 0, true); + Add(4, 6U, "301", "350", 8 * kFileSize, 0, 101, 150); + Add(4, 7U, "501", "750", 8 * kFileSize, 0, 101, 150); + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + + ASSERT_TRUE(compaction); + // Validate that its a delete triggered compaction + ASSERT_EQ(CompactionReason::kFilesMarkedForCompaction, + compaction->compaction_reason()); + ASSERT_TRUE(compaction->start_level() == 0 || + compaction->start_level() == 3); + if (compaction->start_level() == 0) { + // The L0 file was picked. 
The next compaction will detect an + // overlap on its input level + input_level_overlap = true; + ASSERT_EQ(3, compaction->output_level()); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(3U, compaction->num_input_files(1)); + } else { + // The level 3 file was picked. The next compaction will pick + // the L0 file and will detect overlap when adding output + // level inputs + output_level_overlap = true; + ASSERT_EQ(4, compaction->output_level()); + ASSERT_EQ(2U, compaction->num_input_files(0)); + ASSERT_EQ(1U, compaction->num_input_files(1)); + } + + vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_); + // After recomputing the compaction score, only one marked file will remain + random_index = 0; + std::unique_ptr<Compaction> compaction2( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_FALSE(compaction2); + DeleteVersionStorage(); + } +} + +TEST_F(CompactionPickerTest, UniversalMarkedL0NoOverlap) { + const uint64_t kFileSize = 100000; + + ioptions_.compaction_style = kCompactionStyleUniversal; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + // This test covers the case where a delete triggered compaction is + // scheduled and should result in a full compaction + NewVersionStorage(1, kCompactionStyleUniversal); + + // Mark file number 4 for compaction + Add(0, 4U, "260", "300", 1 * kFileSize, 0, 260, 300, 0, true); + Add(0, 5U, "240", "290", 2 * kFileSize, 0, 201, 250); + Add(0, 3U, "301", "350", 4 * kFileSize, 0, 101, 150); + Add(0, 6U, "501", "750", 8 * kFileSize, 0, 50, 100); + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + + ASSERT_TRUE(compaction); + // Validate that its a delete triggered compaction + ASSERT_EQ(CompactionReason::kFilesMarkedForCompaction, + compaction->compaction_reason()); + ASSERT_EQ(0, compaction->output_level()); + ASSERT_EQ(0, compaction->start_level()); + ASSERT_EQ(4U, compaction->num_input_files(0)); + ASSERT_TRUE(file_map_[4].first->being_compacted); + ASSERT_TRUE(file_map_[5].first->being_compacted); + ASSERT_TRUE(file_map_[3].first->being_compacted); + ASSERT_TRUE(file_map_[6].first->being_compacted); +} + +TEST_F(CompactionPickerTest, UniversalMarkedL0WithOverlap) { + const uint64_t kFileSize = 100000; + + ioptions_.compaction_style = kCompactionStyleUniversal; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + // This test covers the case where a file is being compacted, and a + // delete triggered compaction is then scheduled. 
The latter should stop + // at the first file being compacted + NewVersionStorage(1, kCompactionStyleUniversal); + + // Mark file number 4 for compaction + Add(0, 4U, "260", "300", 1 * kFileSize, 0, 260, 300, 0, true); + Add(0, 5U, "240", "290", 2 * kFileSize, 0, 201, 250); + Add(0, 3U, "301", "350", 4 * kFileSize, 0, 101, 150); + Add(0, 6U, "501", "750", 8 * kFileSize, 0, 50, 100); + UpdateVersionStorageInfo(); + file_map_[3].first->being_compacted = true; + + std::unique_ptr<Compaction> compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + + ASSERT_TRUE(compaction); + // Validate that its a delete triggered compaction + ASSERT_EQ(CompactionReason::kFilesMarkedForCompaction, + compaction->compaction_reason()); + ASSERT_EQ(0, compaction->output_level()); + ASSERT_EQ(0, compaction->start_level()); + ASSERT_EQ(2U, compaction->num_input_files(0)); + ASSERT_TRUE(file_map_[4].first->being_compacted); + ASSERT_TRUE(file_map_[5].first->being_compacted); +} + +TEST_F(CompactionPickerTest, UniversalMarkedL0Overlap2) { + const uint64_t kFileSize = 100000; + + ioptions_.compaction_style = kCompactionStyleUniversal; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + // This test covers the case where a delete triggered compaction is + // scheduled first, followed by a "regular" compaction. The latter + // should fail + NewVersionStorage(1, kCompactionStyleUniversal); + + // Mark file number 5 for compaction + Add(0, 4U, "260", "300", 1 * kFileSize, 0, 260, 300); + Add(0, 5U, "240", "290", 2 * kFileSize, 0, 201, 250, 0, true); + Add(0, 3U, "301", "350", 4 * kFileSize, 0, 101, 150); + Add(0, 6U, "501", "750", 8 * kFileSize, 0, 50, 100); + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + + ASSERT_TRUE(compaction); + // Validate that its a delete triggered compaction + ASSERT_EQ(CompactionReason::kFilesMarkedForCompaction, + compaction->compaction_reason()); + ASSERT_EQ(0, compaction->output_level()); + ASSERT_EQ(0, compaction->start_level()); + ASSERT_EQ(3U, compaction->num_input_files(0)); + ASSERT_TRUE(file_map_[5].first->being_compacted); + ASSERT_TRUE(file_map_[3].first->being_compacted); + ASSERT_TRUE(file_map_[6].first->being_compacted); + + AddVersionStorage(); + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); + Add(0, 2U, "201", "250", kFileSize, 0, 401, 450); + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction2( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction2); + ASSERT_EQ(3U, compaction->num_input_files(0)); + ASSERT_TRUE(file_map_[1].first->being_compacted); + ASSERT_TRUE(file_map_[2].first->being_compacted); + ASSERT_TRUE(file_map_[4].first->being_compacted); +} + +TEST_F(CompactionPickerTest, UniversalMarkedManualCompaction) { + const uint64_t kFileSize = 100000; + const int kNumLevels = 7; + + // This test makes sure the `files_marked_for_compaction_` is updated after + // creating manual compaction. 
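+  // (The expectation is that CompactRange below both picks the three marked
+  //  files and clears them from FilesMarkedForCompaction, so no other
+  //  compaction can pick them again.)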
+ ioptions_.compaction_style = kCompactionStyleUniversal; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(kNumLevels, kCompactionStyleUniversal); + + // Add 3 files marked for compaction + Add(0, 3U, "301", "350", 4 * kFileSize, 0, 101, 150, 0, true); + Add(0, 4U, "260", "300", 1 * kFileSize, 0, 260, 300, 0, true); + Add(0, 5U, "240", "290", 2 * kFileSize, 0, 201, 250, 0, true); + UpdateVersionStorageInfo(); + + // All 3 files are marked for compaction + ASSERT_EQ(3U, vstorage_->FilesMarkedForCompaction().size()); + + bool manual_conflict = false; + InternalKey* manual_end = nullptr; + std::unique_ptr<Compaction> compaction( + universal_compaction_picker.CompactRange( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + ColumnFamilyData::kCompactAllLevels, 6, CompactRangeOptions(), + nullptr, nullptr, &manual_end, &manual_conflict, + std::numeric_limits<uint64_t>::max(), "")); + + ASSERT_TRUE(compaction); + + ASSERT_EQ(CompactionReason::kManualCompaction, + compaction->compaction_reason()); + ASSERT_EQ(kNumLevels - 1, compaction->output_level()); + ASSERT_EQ(0, compaction->start_level()); + ASSERT_EQ(3U, compaction->num_input_files(0)); + ASSERT_TRUE(file_map_[3].first->being_compacted); + ASSERT_TRUE(file_map_[4].first->being_compacted); + ASSERT_TRUE(file_map_[5].first->being_compacted); + + // After creating the manual compaction, all files should be cleared from + // `FilesMarkedForCompaction`. So they won't be picked by others. + ASSERT_EQ(0U, vstorage_->FilesMarkedForCompaction().size()); +} + +TEST_F(CompactionPickerTest, UniversalSizeAmpTierCompactionNonLastLevel) { + // This test make sure size amplification compaction could still be triggered + // if the last sorted run is not the last level. + const uint64_t kFileSize = 100000; + const int kNumLevels = 7; + const int kLastLevel = kNumLevels - 1; + + ioptions_.compaction_style = kCompactionStyleUniversal; + ioptions_.preclude_last_level_data_seconds = 1000; + mutable_cf_options_.compaction_options_universal + .max_size_amplification_percent = 200; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(kNumLevels, kCompactionStyleUniversal); + Add(0, 100U, "100", "300", 1 * kFileSize); + Add(0, 101U, "200", "400", 1 * kFileSize); + Add(4, 90U, "100", "600", 4 * kFileSize); + Add(5, 80U, "200", "300", 2 * kFileSize); + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + + // Make sure it's a size amp compaction and includes all files + ASSERT_EQ(compaction->compaction_reason(), + CompactionReason::kUniversalSizeAmplification); + ASSERT_EQ(compaction->output_level(), kLastLevel); + ASSERT_EQ(compaction->input_levels(0)->num_files, 2); + ASSERT_EQ(compaction->input_levels(4)->num_files, 1); + ASSERT_EQ(compaction->input_levels(5)->num_files, 1); +} + +TEST_F(CompactionPickerTest, UniversalSizeRatioTierCompactionLastLevel) { + // This test makes sure the size amp calculation skips the last level (L6), so + // size amp compaction is not triggered, instead a size ratio compaction is + // triggered. 
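+  // (With the last level ignored, the two 1 * kFileSize L0 files measured
+  //  against the 4 * kFileSize run on L5 stay well under the 200% size-amp
+  //  limit, so the picker falls through to the size-ratio path and compacts
+  //  only the L0 files.)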
+ const uint64_t kFileSize = 100000; + const int kNumLevels = 7; + const int kLastLevel = kNumLevels - 1; + const int kPenultimateLevel = kLastLevel - 1; + + ioptions_.compaction_style = kCompactionStyleUniversal; + ioptions_.preclude_last_level_data_seconds = 1000; + mutable_cf_options_.compaction_options_universal + .max_size_amplification_percent = 200; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(kNumLevels, kCompactionStyleUniversal); + Add(0, 100U, "100", "300", 1 * kFileSize); + Add(0, 101U, "200", "400", 1 * kFileSize); + Add(5, 90U, "100", "600", 4 * kFileSize); + Add(6, 80U, "200", "300", 2 * kFileSize); + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + + // Internally, size amp compaction is evaluated before size ratio compaction. + // Here to make sure it's size ratio compaction instead of size amp + ASSERT_EQ(compaction->compaction_reason(), + CompactionReason::kUniversalSizeRatio); + ASSERT_EQ(compaction->output_level(), kPenultimateLevel - 1); + ASSERT_EQ(compaction->input_levels(0)->num_files, 2); + ASSERT_EQ(compaction->input_levels(5)->num_files, 0); + ASSERT_EQ(compaction->input_levels(6)->num_files, 0); +} + +TEST_F(CompactionPickerTest, UniversalSizeAmpTierCompactionNotSuport) { + // Tiered compaction only support level_num > 2 (otherwise the penultimate + // level is going to be level 0, which may make thing more complicated), so + // when there's only 2 level, still treating level 1 as the last level for + // size amp compaction + const uint64_t kFileSize = 100000; + const int kNumLevels = 2; + const int kLastLevel = kNumLevels - 1; + + ioptions_.compaction_style = kCompactionStyleUniversal; + ioptions_.preclude_last_level_data_seconds = 1000; + mutable_cf_options_.compaction_options_universal + .max_size_amplification_percent = 200; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(kNumLevels, kCompactionStyleUniversal); + Add(0, 100U, "100", "300", 1 * kFileSize); + Add(0, 101U, "200", "400", 1 * kFileSize); + Add(0, 90U, "100", "600", 4 * kFileSize); + Add(1, 80U, "200", "300", 2 * kFileSize); + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + + // size amp compaction is still triggered even preclude_last_level is set + ASSERT_EQ(compaction->compaction_reason(), + CompactionReason::kUniversalSizeAmplification); + ASSERT_EQ(compaction->output_level(), kLastLevel); + ASSERT_EQ(compaction->input_levels(0)->num_files, 3); + ASSERT_EQ(compaction->input_levels(1)->num_files, 1); +} + +TEST_F(CompactionPickerTest, UniversalSizeAmpTierCompactionLastLevel) { + // This test makes sure the size amp compaction for tiered storage could still + // be triggered, but only for non-last-level files + const uint64_t kFileSize = 100000; + const int kNumLevels = 7; + const int kLastLevel = kNumLevels - 1; + const int kPenultimateLevel = kLastLevel - 1; + + ioptions_.compaction_style = kCompactionStyleUniversal; + ioptions_.preclude_last_level_data_seconds = 1000; + mutable_cf_options_.compaction_options_universal + .max_size_amplification_percent = 200; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(kNumLevels, 
kCompactionStyleUniversal); + Add(0, 100U, "100", "300", 3 * kFileSize); + Add(0, 101U, "200", "400", 2 * kFileSize); + Add(5, 90U, "100", "600", 2 * kFileSize); + Add(6, 80U, "200", "300", 2 * kFileSize); + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + + // It's a Size Amp compaction, but doesn't include the last level file and + // output to the penultimate level. + ASSERT_EQ(compaction->compaction_reason(), + CompactionReason::kUniversalSizeAmplification); + ASSERT_EQ(compaction->output_level(), kPenultimateLevel); + ASSERT_EQ(compaction->input_levels(0)->num_files, 2); + ASSERT_EQ(compaction->input_levels(5)->num_files, 1); + ASSERT_EQ(compaction->input_levels(6)->num_files, 0); +} + +TEST_F(CompactionPickerU64TsTest, Overlap) { + int num_levels = ioptions_.num_levels; + NewVersionStorage(num_levels, kCompactionStyleLevel); + + constexpr int level = 0; + constexpr uint64_t file_number = 20ULL; + constexpr char smallest[4] = "500"; + constexpr char largest[4] = "600"; + constexpr uint64_t ts_of_smallest = 12345ULL; + constexpr uint64_t ts_of_largest = 56789ULL; + + { + std::string ts1; + PutFixed64(&ts1, ts_of_smallest); + std::string ts2; + PutFixed64(&ts2, ts_of_largest); + Add(level, file_number, smallest, largest, + /*file_size=*/1U, /*path_id=*/0, + /*smallest_seq=*/100, /*largest_seq=*/100, /*compensated_file_size=*/0, + /*marked_for_compact=*/false, /*temperature=*/Temperature::kUnknown, + /*oldest_ancestor_time=*/kUnknownOldestAncesterTime, ts1, ts2); + UpdateVersionStorageInfo(); + } + + std::unordered_set<uint64_t> input{file_number}; + + std::vector<CompactionInputFiles> input_files; + ASSERT_OK(level_compaction_picker.GetCompactionInputsFromFileNumbers( + &input_files, &input, vstorage_.get(), CompactionOptions())); + std::unique_ptr<Compaction> comp1(level_compaction_picker.CompactFiles( + CompactionOptions(), input_files, level, vstorage_.get(), + mutable_cf_options_, mutable_db_options_, /*output_path_id=*/0)); + + { + // [600, ts=50000] to [600, ts=50000] is the range to check. + // ucmp->Compare(smallest_user_key, c->GetLargestUserKey()) > 0, but + // ucmp->CompareWithoutTimestamp(smallest_user_key, + // c->GetLargestUserKey()) == 0. + // Should still be considered overlapping. + std::string user_key_with_ts1(largest); + PutFixed64(&user_key_with_ts1, ts_of_largest - 1); + std::string user_key_with_ts2(largest); + PutFixed64(&user_key_with_ts2, ts_of_largest - 1); + ASSERT_TRUE(level_compaction_picker.RangeOverlapWithCompaction( + user_key_with_ts1, user_key_with_ts2, level)); + } + { + // [500, ts=60000] to [500, ts=60000] is the range to check. + // ucmp->Compare(largest_user_key, c->GetSmallestUserKey()) < 0, but + // ucmp->CompareWithoutTimestamp(largest_user_key, + // c->GetSmallestUserKey()) == 0. + // Should still be considered overlapping. 
+ std::string user_key_with_ts1(smallest); + PutFixed64(&user_key_with_ts1, ts_of_smallest + 1); + std::string user_key_with_ts2(smallest); + PutFixed64(&user_key_with_ts2, ts_of_smallest + 1); + ASSERT_TRUE(level_compaction_picker.RangeOverlapWithCompaction( + user_key_with_ts1, user_key_with_ts2, level)); + } +} + +TEST_F(CompactionPickerU64TsTest, CannotTrivialMoveUniversal) { + constexpr uint64_t kFileSize = 100000; + + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + mutable_cf_options_.compaction_options_universal.allow_trivial_move = true; + NewVersionStorage(1, kCompactionStyleUniversal); + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + UpdateVersionStorageInfo(); + // must return false when there's no files. + ASSERT_FALSE(universal_compaction_picker.NeedsCompaction(vstorage_.get())); + + std::string ts1; + PutFixed64(&ts1, 9000); + std::string ts2; + PutFixed64(&ts2, 8000); + std::string ts3; + PutFixed64(&ts3, 7000); + std::string ts4; + PutFixed64(&ts4, 6000); + + NewVersionStorage(3, kCompactionStyleUniversal); + // A compaction should be triggered and pick file 2 + Add(1, 1U, "150", "150", kFileSize, /*path_id=*/0, /*smallest_seq=*/100, + /*largest_seq=*/100, /*compensated_file_size=*/kFileSize, + /*marked_for_compact=*/false, Temperature::kUnknown, + kUnknownOldestAncesterTime, ts1, ts2); + Add(2, 2U, "150", "150", kFileSize, /*path_id=*/0, /*smallest_seq=*/100, + /*largest_seq=*/100, /*compensated_file_size=*/kFileSize, + /*marked_for_compact=*/false, Temperature::kUnknown, + kUnknownOldestAncesterTime, ts3, ts4); + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + assert(compaction); + ASSERT_TRUE(!compaction->is_trivial_move()); +} + +class PerKeyPlacementCompactionPickerTest + : public CompactionPickerTest, + public testing::WithParamInterface<bool> { + public: + PerKeyPlacementCompactionPickerTest() : CompactionPickerTest() {} + + void SetUp() override { enable_per_key_placement_ = GetParam(); } + + protected: + bool enable_per_key_placement_ = false; +}; + +TEST_P(PerKeyPlacementCompactionPickerTest, OverlapWithNormalCompaction) { + SyncPoint::GetInstance()->SetCallBack( + "Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) { + auto supports_per_key_placement = static_cast<bool*>(arg); + *supports_per_key_placement = enable_per_key_placement_; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + int num_levels = ioptions_.num_levels; + NewVersionStorage(num_levels, kCompactionStyleLevel); + + Add(0, 21U, "100", "150", 60000000U); + Add(0, 22U, "300", "350", 60000000U); + Add(5, 40U, "200", "250", 60000000U); + Add(6, 50U, "101", "351", 60000000U); + UpdateVersionStorageInfo(); + + CompactionOptions comp_options; + std::unordered_set<uint64_t> input_set; + input_set.insert(40); + std::vector<CompactionInputFiles> input_files; + ASSERT_OK(level_compaction_picker.GetCompactionInputsFromFileNumbers( + &input_files, &input_set, vstorage_.get(), comp_options)); + + std::unique_ptr<Compaction> comp1(level_compaction_picker.CompactFiles( + comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_, + mutable_db_options_, 0)); + + input_set.clear(); + input_files.clear(); + input_set.insert(21); + input_set.insert(22); + input_set.insert(50); + ASSERT_OK(level_compaction_picker.GetCompactionInputsFromFileNumbers( + &input_files, &input_set, vstorage_.get(), 
comp_options)); + + ASSERT_EQ(enable_per_key_placement_, + level_compaction_picker.FilesRangeOverlapWithCompaction( + input_files, 6, + Compaction::EvaluatePenultimateLevel(vstorage_.get(), ioptions_, + 0, 6))); +} + +TEST_P(PerKeyPlacementCompactionPickerTest, NormalCompactionOverlap) { + SyncPoint::GetInstance()->SetCallBack( + "Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) { + auto supports_per_key_placement = static_cast<bool*>(arg); + *supports_per_key_placement = enable_per_key_placement_; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + int num_levels = ioptions_.num_levels; + NewVersionStorage(num_levels, kCompactionStyleLevel); + + Add(0, 21U, "100", "150", 60000000U); + Add(0, 22U, "300", "350", 60000000U); + Add(4, 40U, "200", "220", 60000000U); + Add(4, 41U, "230", "250", 60000000U); + Add(6, 50U, "101", "351", 60000000U); + UpdateVersionStorageInfo(); + + CompactionOptions comp_options; + std::unordered_set<uint64_t> input_set; + input_set.insert(21); + input_set.insert(22); + input_set.insert(50); + std::vector<CompactionInputFiles> input_files; + ASSERT_OK(level_compaction_picker.GetCompactionInputsFromFileNumbers( + &input_files, &input_set, vstorage_.get(), comp_options)); + + std::unique_ptr<Compaction> comp1(level_compaction_picker.CompactFiles( + comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_, + mutable_db_options_, 0)); + + input_set.clear(); + input_files.clear(); + input_set.insert(40); + input_set.insert(41); + ASSERT_OK(level_compaction_picker.GetCompactionInputsFromFileNumbers( + &input_files, &input_set, vstorage_.get(), comp_options)); + + ASSERT_EQ(enable_per_key_placement_, + level_compaction_picker.FilesRangeOverlapWithCompaction( + input_files, 5, Compaction::kInvalidLevel)); +} + +TEST_P(PerKeyPlacementCompactionPickerTest, + OverlapWithNormalCompactionUniveral) { + SyncPoint::GetInstance()->SetCallBack( + "Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) { + auto supports_per_key_placement = static_cast<bool*>(arg); + *supports_per_key_placement = enable_per_key_placement_; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + int num_levels = ioptions_.num_levels; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + NewVersionStorage(num_levels, kCompactionStyleUniversal); + + Add(0, 21U, "100", "150", 60000000U); + Add(0, 22U, "300", "350", 60000000U); + Add(5, 40U, "200", "250", 60000000U); + Add(6, 50U, "101", "351", 60000000U); + UpdateVersionStorageInfo(); + + CompactionOptions comp_options; + std::unordered_set<uint64_t> input_set; + input_set.insert(40); + std::vector<CompactionInputFiles> input_files; + ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers( + &input_files, &input_set, vstorage_.get(), comp_options)); + + std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles( + comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_, + mutable_db_options_, 0)); + + input_set.clear(); + input_files.clear(); + input_set.insert(21); + input_set.insert(22); + input_set.insert(50); + ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers( + &input_files, &input_set, vstorage_.get(), comp_options)); + + ASSERT_EQ(enable_per_key_placement_, + universal_compaction_picker.FilesRangeOverlapWithCompaction( + input_files, 6, + Compaction::EvaluatePenultimateLevel(vstorage_.get(), ioptions_, + 0, 6))); +} + +TEST_P(PerKeyPlacementCompactionPickerTest, NormalCompactionOverlapUniversal) { + 
SyncPoint::GetInstance()->SetCallBack( + "Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) { + auto supports_per_key_placement = static_cast<bool*>(arg); + *supports_per_key_placement = enable_per_key_placement_; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + int num_levels = ioptions_.num_levels; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + NewVersionStorage(num_levels, kCompactionStyleUniversal); + + Add(0, 21U, "100", "150", 60000000U); + Add(0, 22U, "300", "350", 60000000U); + Add(4, 40U, "200", "220", 60000000U); + Add(4, 41U, "230", "250", 60000000U); + Add(6, 50U, "101", "351", 60000000U); + UpdateVersionStorageInfo(); + + CompactionOptions comp_options; + std::unordered_set<uint64_t> input_set; + input_set.insert(21); + input_set.insert(22); + input_set.insert(50); + std::vector<CompactionInputFiles> input_files; + ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers( + &input_files, &input_set, vstorage_.get(), comp_options)); + + std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles( + comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_, + mutable_db_options_, 0)); + + input_set.clear(); + input_files.clear(); + input_set.insert(40); + input_set.insert(41); + ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers( + &input_files, &input_set, vstorage_.get(), comp_options)); + + ASSERT_EQ(enable_per_key_placement_, + universal_compaction_picker.FilesRangeOverlapWithCompaction( + input_files, 5, Compaction::kInvalidLevel)); +} + +TEST_P(PerKeyPlacementCompactionPickerTest, PenultimateOverlapUniversal) { + // This test makes sure that tiered compaction locks the whole range of + // both the output level and the penultimate level. + if (enable_per_key_placement_) { + ioptions_.preclude_last_level_data_seconds = 10000; + } + + int num_levels = ioptions_.num_levels; + ioptions_.compaction_style = kCompactionStyleUniversal; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + NewVersionStorage(num_levels, kCompactionStyleUniversal); + + // L4: [200, 220] [230, 250] [360, 380] + // L5: + // L6: [101, 351] + Add(4, 40U, "200", "220", 60000000U); + Add(4, 41U, "230", "250", 60000000U); + Add(4, 42U, "360", "380", 60000000U); + Add(6, 60U, "101", "351", 60000000U); + UpdateVersionStorageInfo(); + + // The existing compaction consists of the 1st L4 file and the L6 file, + // so compacting the 2nd L4 file to L5 (the penultimate level) overlaps with + // it when the tiered compaction feature is on.
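To see why the test above expects an overlap only for the 2nd L4 file, the key ranges can be checked by hand. The following standalone sketch is not part of the patch; it just replays the interval check using the test's three-digit string keys, for which lexicographic comparison matches numeric order.

#include <iostream>
#include <string>

// The ongoing compaction (file 40 [200, 220] + file 60 [101, 351]) locks the
// whole [101, 351] range on both the output and the penultimate level once
// per-key placement is enabled, so file 41 [230, 250] collides while file 42
// [360, 380] stays clear.
bool Overlaps(const std::string& a_lo, const std::string& a_hi,
              const std::string& b_lo, const std::string& b_hi) {
  return a_lo <= b_hi && b_lo <= a_hi;
}

int main() {
  std::cout << std::boolalpha
            << Overlaps("101", "351", "230", "250") << '\n'   // true
            << Overlaps("101", "351", "360", "380") << '\n';  // false
}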
+ CompactionOptions comp_options; + std::unordered_set<uint64_t> input_set; + input_set.insert(40); + input_set.insert(60); + std::vector<CompactionInputFiles> input_files; + ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers( + &input_files, &input_set, vstorage_.get(), comp_options)); + + std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles( + comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_, + mutable_db_options_, 0)); + + input_set.clear(); + input_files.clear(); + input_set.insert(41); + ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers( + &input_files, &input_set, vstorage_.get(), comp_options)); + + ASSERT_EQ(enable_per_key_placement_, + universal_compaction_picker.FilesRangeOverlapWithCompaction( + input_files, 5, Compaction::kInvalidLevel)); + + // compacting the 3rd L4 file is always safe: + input_set.clear(); + input_files.clear(); + input_set.insert(42); + ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers( + &input_files, &input_set, vstorage_.get(), comp_options)); + + ASSERT_FALSE(universal_compaction_picker.FilesRangeOverlapWithCompaction( + input_files, 5, Compaction::kInvalidLevel)); +} + +TEST_P(PerKeyPlacementCompactionPickerTest, LastLevelOnlyOverlapUniversal) { + if (enable_per_key_placement_) { + ioptions_.preclude_last_level_data_seconds = 10000; + } + + int num_levels = ioptions_.num_levels; + ioptions_.compaction_style = kCompactionStyleUniversal; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + NewVersionStorage(num_levels, kCompactionStyleUniversal); + + // L4: [200, 220] [230, 250] [360, 380] + // L5: + // L6: [101, 351] + Add(4, 40U, "200", "220", 60000000U); + Add(4, 41U, "230", "250", 60000000U); + Add(4, 42U, "360", "380", 60000000U); + Add(6, 60U, "101", "351", 60000000U); + UpdateVersionStorageInfo(); + + CompactionOptions comp_options; + std::unordered_set<uint64_t> input_set; + input_set.insert(60); + std::vector<CompactionInputFiles> input_files; + ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers( + &input_files, &input_set, vstorage_.get(), comp_options)); + + std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles( + comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_, + mutable_db_options_, 0)); + + // cannot compact file 41 if the preclude_last_level feature is on, otherwise + // compact file 41 is okay. + input_set.clear(); + input_files.clear(); + input_set.insert(41); + ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers( + &input_files, &input_set, vstorage_.get(), comp_options)); + + ASSERT_EQ(enable_per_key_placement_, + universal_compaction_picker.FilesRangeOverlapWithCompaction( + input_files, 5, Compaction::kInvalidLevel)); + + // compacting the 3rd L4 file is always safe: + input_set.clear(); + input_files.clear(); + input_set.insert(42); + ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers( + &input_files, &input_set, vstorage_.get(), comp_options)); + + ASSERT_FALSE(universal_compaction_picker.FilesRangeOverlapWithCompaction( + input_files, 5, Compaction::kInvalidLevel)); +} + +TEST_P(PerKeyPlacementCompactionPickerTest, + LastLevelOnlyFailPenultimateUniversal) { + // This is to test last_level only compaction still unable to do the + // penultimate level compaction if there's already a file in the penultimate + // level. 
+ // This should rarely happen in universal compaction, as the non-empty L5 + // should be included in the compaction. + if (enable_per_key_placement_) { + ioptions_.preclude_last_level_data_seconds = 10000; + } + + int num_levels = ioptions_.num_levels; + ioptions_.compaction_style = kCompactionStyleUniversal; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + NewVersionStorage(num_levels, kCompactionStyleUniversal); + + // L4: [200, 220] + // L5: [230, 250] + // L6: [101, 351] + Add(4, 40U, "200", "220", 60000000U); + Add(5, 50U, "230", "250", 60000000U); + Add(6, 60U, "101", "351", 60000000U); + UpdateVersionStorageInfo(); + + CompactionOptions comp_options; + std::unordered_set<uint64_t> input_set; + input_set.insert(60); + std::vector<CompactionInputFiles> input_files; + ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers( + &input_files, &input_set, vstorage_.get(), comp_options)); + + std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles( + comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_, + mutable_db_options_, 0)); + + ASSERT_TRUE(comp1); + ASSERT_EQ(comp1->GetPenultimateLevel(), Compaction::kInvalidLevel); + + // As comp1 cannot be output to the penultimate level, compacting file 40 to + // L5 is always safe. + input_set.clear(); + input_files.clear(); + input_set.insert(40); + ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers( + &input_files, &input_set, vstorage_.get(), comp_options)); + + ASSERT_FALSE(universal_compaction_picker.FilesRangeOverlapWithCompaction( + input_files, 5, Compaction::kInvalidLevel)); + + std::unique_ptr<Compaction> comp2(universal_compaction_picker.CompactFiles( + comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_, + mutable_db_options_, 0)); + ASSERT_TRUE(comp2); + ASSERT_EQ(Compaction::kInvalidLevel, comp2->GetPenultimateLevel()); +} + +TEST_P(PerKeyPlacementCompactionPickerTest, + LastLevelOnlyConflictWithOngoingUniversal) { + // This is to test last_level only compaction still unable to do the + // penultimate level compaction if there's already an ongoing compaction to + // the penultimate level + if (enable_per_key_placement_) { + ioptions_.preclude_last_level_data_seconds = 10000; + } + + int num_levels = ioptions_.num_levels; + ioptions_.compaction_style = kCompactionStyleUniversal; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + NewVersionStorage(num_levels, kCompactionStyleUniversal); + + // L4: [200, 220] [230, 250] [360, 380] + // L5: + // L6: [101, 351] + Add(4, 40U, "200", "220", 60000000U); + Add(4, 41U, "230", "250", 60000000U); + Add(4, 42U, "360", "380", 60000000U); + Add(6, 60U, "101", "351", 60000000U); + UpdateVersionStorageInfo(); + + // create an ongoing compaction to L5 (penultimate level) + CompactionOptions comp_options; + std::unordered_set<uint64_t> input_set; + input_set.insert(40); + std::vector<CompactionInputFiles> input_files; + ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers( + &input_files, &input_set, vstorage_.get(), comp_options)); + + std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles( + comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_, + mutable_db_options_, 0)); + + ASSERT_TRUE(comp1); + ASSERT_EQ(comp1->GetPenultimateLevel(), Compaction::kInvalidLevel); + + input_set.clear(); + input_files.clear(); + input_set.insert(60); + 
ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers( + &input_files, &input_set, vstorage_.get(), comp_options)); + + ASSERT_EQ(enable_per_key_placement_, + universal_compaction_picker.FilesRangeOverlapWithCompaction( + input_files, 6, + Compaction::EvaluatePenultimateLevel(vstorage_.get(), ioptions_, + 6, 6))); + + if (!enable_per_key_placement_) { + std::unique_ptr<Compaction> comp2(universal_compaction_picker.CompactFiles( + comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_, + mutable_db_options_, 0)); + ASSERT_TRUE(comp2); + ASSERT_EQ(Compaction::kInvalidLevel, comp2->GetPenultimateLevel()); + } +} + +TEST_P(PerKeyPlacementCompactionPickerTest, + LastLevelOnlyNoConflictWithOngoingUniversal) { + // This is similar to `LastLevelOnlyConflictWithOngoingUniversal`, the only + // change is the ongoing compaction to L5 has no overlap with the last level + // compaction, so it's safe to move data from the last level to the + // penultimate level. + if (enable_per_key_placement_) { + ioptions_.preclude_last_level_data_seconds = 10000; + } + + int num_levels = ioptions_.num_levels; + ioptions_.compaction_style = kCompactionStyleUniversal; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + NewVersionStorage(num_levels, kCompactionStyleUniversal); + + // L4: [200, 220] [230, 250] [360, 380] + // L5: + // L6: [101, 351] + Add(4, 40U, "200", "220", 60000000U); + Add(4, 41U, "230", "250", 60000000U); + Add(4, 42U, "360", "380", 60000000U); + Add(6, 60U, "101", "351", 60000000U); + UpdateVersionStorageInfo(); + + // create an ongoing compaction to L5 (penultimate level) + CompactionOptions comp_options; + std::unordered_set<uint64_t> input_set; + input_set.insert(42); + std::vector<CompactionInputFiles> input_files; + ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers( + &input_files, &input_set, vstorage_.get(), comp_options)); + + std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles( + comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_, + mutable_db_options_, 0)); + + ASSERT_TRUE(comp1); + ASSERT_EQ(comp1->GetPenultimateLevel(), Compaction::kInvalidLevel); + + input_set.clear(); + input_files.clear(); + input_set.insert(60); + ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers( + &input_files, &input_set, vstorage_.get(), comp_options)); + + // always safe to move data up + ASSERT_FALSE(universal_compaction_picker.FilesRangeOverlapWithCompaction( + input_files, 6, + Compaction::EvaluatePenultimateLevel(vstorage_.get(), ioptions_, 6, 6))); + + // 2 compactions can be run in parallel + std::unique_ptr<Compaction> comp2(universal_compaction_picker.CompactFiles( + comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_, + mutable_db_options_, 0)); + ASSERT_TRUE(comp2); + if (enable_per_key_placement_) { + ASSERT_NE(Compaction::kInvalidLevel, comp2->GetPenultimateLevel()); + } else { + ASSERT_EQ(Compaction::kInvalidLevel, comp2->GetPenultimateLevel()); + } +} + +INSTANTIATE_TEST_CASE_P(PerKeyPlacementCompactionPickerTest, + PerKeyPlacementCompactionPickerTest, ::testing::Bool()); + +#endif // ROCKSDB_LITE + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/compaction/compaction_picker_universal.cc b/src/rocksdb/db/compaction/compaction_picker_universal.cc new file mode 100644 
index 000000000..376e4f60f --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_picker_universal.cc @@ -0,0 +1,1450 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/compaction/compaction_picker_universal.h" +#ifndef ROCKSDB_LITE + +#include <cinttypes> +#include <limits> +#include <queue> +#include <string> +#include <utility> + +#include "db/column_family.h" +#include "file/filename.h" +#include "logging/log_buffer.h" +#include "logging/logging.h" +#include "monitoring/statistics.h" +#include "test_util/sync_point.h" +#include "util/random.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { +namespace { +// A helper class that form universal compactions. The class is used by +// UniversalCompactionPicker::PickCompaction(). +// The usage is to create the class, and get the compaction object by calling +// PickCompaction(). +class UniversalCompactionBuilder { + public: + UniversalCompactionBuilder( + const ImmutableOptions& ioptions, const InternalKeyComparator* icmp, + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + UniversalCompactionPicker* picker, LogBuffer* log_buffer) + : ioptions_(ioptions), + icmp_(icmp), + cf_name_(cf_name), + mutable_cf_options_(mutable_cf_options), + mutable_db_options_(mutable_db_options), + vstorage_(vstorage), + picker_(picker), + log_buffer_(log_buffer) {} + + // Form and return the compaction object. The caller owns return object. + Compaction* PickCompaction(); + + private: + struct SortedRun { + SortedRun(int _level, FileMetaData* _file, uint64_t _size, + uint64_t _compensated_file_size, bool _being_compacted) + : level(_level), + file(_file), + size(_size), + compensated_file_size(_compensated_file_size), + being_compacted(_being_compacted) { + assert(compensated_file_size > 0); + assert(level != 0 || file != nullptr); + } + + void Dump(char* out_buf, size_t out_buf_size, + bool print_path = false) const; + + // sorted_run_count is added into the string to print + void DumpSizeInfo(char* out_buf, size_t out_buf_size, + size_t sorted_run_count) const; + + int level; + // `file` Will be null for level > 0. For level = 0, the sorted run is + // for this file. + FileMetaData* file; + // For level > 0, `size` and `compensated_file_size` are sum of sizes all + // files in the level. `being_compacted` should be the same for all files + // in a non-zero level. Use the value here. + uint64_t size; + uint64_t compensated_file_size; + bool being_compacted; + }; + + // Pick Universal compaction to limit read amplification + Compaction* PickCompactionToReduceSortedRuns( + unsigned int ratio, unsigned int max_number_of_files_to_compact); + + // Pick Universal compaction to limit space amplification. + Compaction* PickCompactionToReduceSizeAmp(); + + // Try to pick incremental compaction to reduce space amplification. + // It will return null if it cannot find a fanout within the threshold. 
+ // Fanout is defined as + // total size of files to compact at output level + // -------------------------------------------------- + // total size of files to compact at other levels + Compaction* PickIncrementalForReduceSizeAmp(double fanout_threshold); + + Compaction* PickDeleteTriggeredCompaction(); + + // Form a compaction from the sorted run indicated by start_index to the + // oldest sorted run. + // The caller is responsible for making sure that those files are not in + // compaction. + Compaction* PickCompactionToOldest(size_t start_index, + CompactionReason compaction_reason); + + Compaction* PickCompactionWithSortedRunRange( + size_t start_index, size_t end_index, CompactionReason compaction_reason); + + // Try to pick periodic compaction. The caller should only call it + // if there is at least one file marked for periodic compaction. + // null will be returned if no such a compaction can be formed + // because some files are being compacted. + Compaction* PickPeriodicCompaction(); + + // Used in universal compaction when the allow_trivial_move + // option is set. Checks whether there are any overlapping files + // in the input. Returns true if the input files are non + // overlapping. + bool IsInputFilesNonOverlapping(Compaction* c); + + uint64_t GetMaxOverlappingBytes() const; + + const ImmutableOptions& ioptions_; + const InternalKeyComparator* icmp_; + double score_; + std::vector<SortedRun> sorted_runs_; + const std::string& cf_name_; + const MutableCFOptions& mutable_cf_options_; + const MutableDBOptions& mutable_db_options_; + VersionStorageInfo* vstorage_; + UniversalCompactionPicker* picker_; + LogBuffer* log_buffer_; + + static std::vector<SortedRun> CalculateSortedRuns( + const VersionStorageInfo& vstorage); + + // Pick a path ID to place a newly generated file, with its estimated file + // size. + static uint32_t GetPathId(const ImmutableCFOptions& ioptions, + const MutableCFOptions& mutable_cf_options, + uint64_t file_size); +}; + +// Used in universal compaction when trivial move is enabled. +// This structure is used for the construction of min heap +// that contains the file meta data, the level of the file +// and the index of the file in that level + +struct InputFileInfo { + InputFileInfo() : f(nullptr), level(0), index(0) {} + + FileMetaData* f; + size_t level; + size_t index; +}; + +// Used in universal compaction when trivial move is enabled. +// This comparator is used for the construction of min heap +// based on the smallest key of the file. +struct SmallestKeyHeapComparator { + explicit SmallestKeyHeapComparator(const Comparator* ucmp) { ucmp_ = ucmp; } + + bool operator()(InputFileInfo i1, InputFileInfo i2) const { + return (ucmp_->CompareWithoutTimestamp(i1.f->smallest.user_key(), + i2.f->smallest.user_key()) > 0); + } + + private: + const Comparator* ucmp_; +}; + +using SmallestKeyHeap = + std::priority_queue<InputFileInfo, std::vector<InputFileInfo>, + SmallestKeyHeapComparator>; + +// This function creates the heap that is used to find if the files are +// overlapping during universal compaction when the allow_trivial_move +// is set. 
+SmallestKeyHeap create_level_heap(Compaction* c, const Comparator* ucmp) { + SmallestKeyHeap smallest_key_priority_q = + SmallestKeyHeap(SmallestKeyHeapComparator(ucmp)); + + InputFileInfo input_file; + + for (size_t l = 0; l < c->num_input_levels(); l++) { + if (c->num_input_files(l) != 0) { + if (l == 0 && c->start_level() == 0) { + for (size_t i = 0; i < c->num_input_files(0); i++) { + input_file.f = c->input(0, i); + input_file.level = 0; + input_file.index = i; + smallest_key_priority_q.push(std::move(input_file)); + } + } else { + input_file.f = c->input(l, 0); + input_file.level = l; + input_file.index = 0; + smallest_key_priority_q.push(std::move(input_file)); + } + } + } + return smallest_key_priority_q; +} + +#ifndef NDEBUG +// smallest_seqno and largest_seqno are set iff. `files` is not empty. +void GetSmallestLargestSeqno(const std::vector<FileMetaData*>& files, + SequenceNumber* smallest_seqno, + SequenceNumber* largest_seqno) { + bool is_first = true; + for (FileMetaData* f : files) { + assert(f->fd.smallest_seqno <= f->fd.largest_seqno); + if (is_first) { + is_first = false; + *smallest_seqno = f->fd.smallest_seqno; + *largest_seqno = f->fd.largest_seqno; + } else { + if (f->fd.smallest_seqno < *smallest_seqno) { + *smallest_seqno = f->fd.smallest_seqno; + } + if (f->fd.largest_seqno > *largest_seqno) { + *largest_seqno = f->fd.largest_seqno; + } + } + } +} +#endif +} // namespace + +// Algorithm that checks to see if there are any overlapping +// files in the input +bool UniversalCompactionBuilder::IsInputFilesNonOverlapping(Compaction* c) { + auto comparator = icmp_->user_comparator(); + int first_iter = 1; + + InputFileInfo prev, curr, next; + + SmallestKeyHeap smallest_key_priority_q = + create_level_heap(c, icmp_->user_comparator()); + + while (!smallest_key_priority_q.empty()) { + curr = smallest_key_priority_q.top(); + smallest_key_priority_q.pop(); + + if (first_iter) { + prev = curr; + first_iter = 0; + } else { + if (comparator->CompareWithoutTimestamp( + prev.f->largest.user_key(), curr.f->smallest.user_key()) >= 0) { + // found overlapping files, return false + return false; + } + assert(comparator->CompareWithoutTimestamp( + curr.f->largest.user_key(), prev.f->largest.user_key()) > 0); + prev = curr; + } + + next.f = nullptr; + + if (c->level(curr.level) != 0 && + curr.index < c->num_input_files(curr.level) - 1) { + next.f = c->input(curr.level, curr.index + 1); + next.level = curr.level; + next.index = curr.index + 1; + } + + if (next.f) { + smallest_key_priority_q.push(std::move(next)); + } + } + return true; +} + +bool UniversalCompactionPicker::NeedsCompaction( + const VersionStorageInfo* vstorage) const { + const int kLevel0 = 0; + if (vstorage->CompactionScore(kLevel0) >= 1) { + return true; + } + if (!vstorage->FilesMarkedForPeriodicCompaction().empty()) { + return true; + } + if (!vstorage->FilesMarkedForCompaction().empty()) { + return true; + } + return false; +} + +Compaction* UniversalCompactionPicker::PickCompaction( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + LogBuffer* log_buffer, SequenceNumber /* earliest_memtable_seqno */) { + UniversalCompactionBuilder builder(ioptions_, icmp_, cf_name, + mutable_cf_options, mutable_db_options, + vstorage, this, log_buffer); + return builder.PickCompaction(); +} + +void UniversalCompactionBuilder::SortedRun::Dump(char* out_buf, + size_t out_buf_size, + bool print_path) const { + if (level == 0) { + 
assert(file != nullptr); + if (file->fd.GetPathId() == 0 || !print_path) { + snprintf(out_buf, out_buf_size, "file %" PRIu64, file->fd.GetNumber()); + } else { + snprintf(out_buf, out_buf_size, + "file %" PRIu64 + "(path " + "%" PRIu32 ")", + file->fd.GetNumber(), file->fd.GetPathId()); + } + } else { + snprintf(out_buf, out_buf_size, "level %d", level); + } +} + +void UniversalCompactionBuilder::SortedRun::DumpSizeInfo( + char* out_buf, size_t out_buf_size, size_t sorted_run_count) const { + if (level == 0) { + assert(file != nullptr); + snprintf(out_buf, out_buf_size, + "file %" PRIu64 "[%" ROCKSDB_PRIszt + "] " + "with size %" PRIu64 " (compensated size %" PRIu64 ")", + file->fd.GetNumber(), sorted_run_count, file->fd.GetFileSize(), + file->compensated_file_size); + } else { + snprintf(out_buf, out_buf_size, + "level %d[%" ROCKSDB_PRIszt + "] " + "with size %" PRIu64 " (compensated size %" PRIu64 ")", + level, sorted_run_count, size, compensated_file_size); + } +} + +std::vector<UniversalCompactionBuilder::SortedRun> +UniversalCompactionBuilder::CalculateSortedRuns( + const VersionStorageInfo& vstorage) { + std::vector<UniversalCompactionBuilder::SortedRun> ret; + for (FileMetaData* f : vstorage.LevelFiles(0)) { + ret.emplace_back(0, f, f->fd.GetFileSize(), f->compensated_file_size, + f->being_compacted); + } + for (int level = 1; level < vstorage.num_levels(); level++) { + uint64_t total_compensated_size = 0U; + uint64_t total_size = 0U; + bool being_compacted = false; + for (FileMetaData* f : vstorage.LevelFiles(level)) { + total_compensated_size += f->compensated_file_size; + total_size += f->fd.GetFileSize(); + // Size amp, read amp and periodic compactions always include all files + // for a non-zero level. However, a delete triggered compaction and + // a trivial move might pick a subset of files in a sorted run. So + // always check all files in a sorted run and mark the entire run as + // being compacted if one or more files are being compacted + if (f->being_compacted) { + being_compacted = f->being_compacted; + } + } + if (total_compensated_size > 0) { + ret.emplace_back(level, nullptr, total_size, total_compensated_size, + being_compacted); + } + } + return ret; +} + +// Universal style of compaction. Pick files that are contiguous in +// time-range to compact. +Compaction* UniversalCompactionBuilder::PickCompaction() { + const int kLevel0 = 0; + score_ = vstorage_->CompactionScore(kLevel0); + sorted_runs_ = CalculateSortedRuns(*vstorage_); + + if (sorted_runs_.size() == 0 || + (vstorage_->FilesMarkedForPeriodicCompaction().empty() && + vstorage_->FilesMarkedForCompaction().empty() && + sorted_runs_.size() < (unsigned int)mutable_cf_options_ + .level0_file_num_compaction_trigger)) { + ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: nothing to do\n", + cf_name_.c_str()); + TEST_SYNC_POINT_CALLBACK( + "UniversalCompactionBuilder::PickCompaction:Return", nullptr); + return nullptr; + } + VersionStorageInfo::LevelSummaryStorage tmp; + ROCKS_LOG_BUFFER_MAX_SZ( + log_buffer_, 3072, + "[%s] Universal: sorted runs: %" ROCKSDB_PRIszt " files: %s\n", + cf_name_.c_str(), sorted_runs_.size(), vstorage_->LevelSummary(&tmp)); + + Compaction* c = nullptr; + // Periodic compaction has higher priority than other type of compaction + // because it's a hard requirement. + if (!vstorage_->FilesMarkedForPeriodicCompaction().empty()) { + // Always need to do a full compaction for periodic compaction. + c = PickPeriodicCompaction(); + } + + // Check for size amplification. 
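CalculateSortedRuns() above treats every L0 file as its own sorted run and every non-empty lower level as a single run. A minimal standalone sketch of that bookkeeping, with made-up file sizes, is shown here; it is not part of the patch.

#include <cstdint>
#include <iostream>
#include <vector>

// Made-up per-level file sizes: L0 holds three files (three runs), L1 is
// empty (no run), and L2 and L3 each contribute one run.
int main() {
  std::vector<std::vector<uint64_t>> level_files = {
      {4, 2, 1}, {}, {8, 8}, {64}};
  std::vector<uint64_t> run_sizes;
  for (uint64_t f : level_files[0]) {
    run_sizes.push_back(f);  // every L0 file is its own sorted run
  }
  for (size_t level = 1; level < level_files.size(); ++level) {
    uint64_t total = 0;
    for (uint64_t f : level_files[level]) {
      total += f;
    }
    if (total > 0) {
      run_sizes.push_back(total);  // a non-empty level is one sorted run
    }
  }
  std::cout << run_sizes.size() << " sorted runs\n";  // prints "5 sorted runs"
}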
+ if (c == nullptr && + sorted_runs_.size() >= + static_cast<size_t>( + mutable_cf_options_.level0_file_num_compaction_trigger)) { + if ((c = PickCompactionToReduceSizeAmp()) != nullptr) { + ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: compacting for size amp\n", + cf_name_.c_str()); + } else { + // Size amplification is within limits. Try reducing read + // amplification while maintaining file size ratios. + unsigned int ratio = + mutable_cf_options_.compaction_options_universal.size_ratio; + + if ((c = PickCompactionToReduceSortedRuns(ratio, UINT_MAX)) != nullptr) { + ROCKS_LOG_BUFFER(log_buffer_, + "[%s] Universal: compacting for size ratio\n", + cf_name_.c_str()); + } else { + // Size amplification and file size ratios are within configured limits. + // If max read amplification is exceeding configured limits, then force + // compaction without looking at filesize ratios and try to reduce + // the number of files to fewer than level0_file_num_compaction_trigger. + // This is guaranteed by NeedsCompaction() + assert(sorted_runs_.size() >= + static_cast<size_t>( + mutable_cf_options_.level0_file_num_compaction_trigger)); + // Get the total number of sorted runs that are not being compacted + int num_sr_not_compacted = 0; + for (size_t i = 0; i < sorted_runs_.size(); i++) { + if (sorted_runs_[i].being_compacted == false) { + num_sr_not_compacted++; + } + } + + // The number of sorted runs that are not being compacted is greater + // than the maximum allowed number of sorted runs + if (num_sr_not_compacted > + mutable_cf_options_.level0_file_num_compaction_trigger) { + unsigned int num_files = + num_sr_not_compacted - + mutable_cf_options_.level0_file_num_compaction_trigger + 1; + if ((c = PickCompactionToReduceSortedRuns(UINT_MAX, num_files)) != + nullptr) { + ROCKS_LOG_BUFFER(log_buffer_, + "[%s] Universal: compacting for file num -- %u\n", + cf_name_.c_str(), num_files); + } + } + } + } + } + + if (c == nullptr) { + if ((c = PickDeleteTriggeredCompaction()) != nullptr) { + ROCKS_LOG_BUFFER(log_buffer_, + "[%s] Universal: delete triggered compaction\n", + cf_name_.c_str()); + } + } + + if (c == nullptr) { + TEST_SYNC_POINT_CALLBACK( + "UniversalCompactionBuilder::PickCompaction:Return", nullptr); + return nullptr; + } + + if (mutable_cf_options_.compaction_options_universal.allow_trivial_move == + true && + c->compaction_reason() != CompactionReason::kPeriodicCompaction) { + c->set_is_trivial_move(IsInputFilesNonOverlapping(c)); + } + +// validate that all the chosen files of L0 are non overlapping in time +#ifndef NDEBUG + bool is_first = true; + + size_t level_index = 0U; + if (c->start_level() == 0) { + for (auto f : *c->inputs(0)) { + assert(f->fd.smallest_seqno <= f->fd.largest_seqno); + if (is_first) { + is_first = false; + } + } + level_index = 1U; + } + for (; level_index < c->num_input_levels(); level_index++) { + if (c->num_input_files(level_index) != 0) { + SequenceNumber smallest_seqno = 0U; + SequenceNumber largest_seqno = 0U; + GetSmallestLargestSeqno(*(c->inputs(level_index)), &smallest_seqno, + &largest_seqno); + if (is_first) { + is_first = false; + } + } + } +#endif + // update statistics + size_t num_files = 0; + for (auto& each_level : *c->inputs()) { + num_files += each_level.files.size(); + } + RecordInHistogram(ioptions_.stats, NUM_FILES_IN_SINGLE_COMPACTION, num_files); + + picker_->RegisterCompaction(c); + vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_); + + TEST_SYNC_POINT_CALLBACK("UniversalCompactionBuilder::PickCompaction:Return", + c); + 
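The read-amplification fallback above compacts just enough sorted runs to get back to level0_file_num_compaction_trigger. A small worked example of that arithmetic, with assumed values, follows; it is not part of the patch.

#include <iostream>

// Assumed values: 7 sorted runs are not under compaction and the trigger is
// 4, so merging (7 - 4 + 1) = 4 runs leaves 3 untouched runs plus 1 output
// run, i.e. exactly the trigger count again.
int main() {
  const int trigger = 4;  // level0_file_num_compaction_trigger
  const int num_sr_not_compacted = 7;
  if (num_sr_not_compacted > trigger) {
    const int num_files = num_sr_not_compacted - trigger + 1;
    std::cout << "force-merge " << num_files << " sorted runs\n";  // 4
  }
}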
return c; +} + +uint32_t UniversalCompactionBuilder::GetPathId( + const ImmutableCFOptions& ioptions, + const MutableCFOptions& mutable_cf_options, uint64_t file_size) { + // Two conditions need to be satisfied: + // (1) the target path needs to be able to hold the file's size + // (2) Total size left in this and previous paths need to be not + // smaller than expected future file size before this new file is + // compacted, which is estimated based on size_ratio. + // For example, if now we are compacting files of size (1, 1, 2, 4, 8), + // we will make sure the target file, probably with size of 16, will be + // placed in a path so that eventually when new files are generated and + // compacted to (1, 1, 2, 4, 8, 16), all those files can be stored in or + // before the path we chose. + // + // TODO(sdong): now the case of multiple column families is not + // considered in this algorithm. So the target size can be violated in + // that case. We need to improve it. + uint64_t accumulated_size = 0; + uint64_t future_size = + file_size * + (100 - mutable_cf_options.compaction_options_universal.size_ratio) / 100; + uint32_t p = 0; + assert(!ioptions.cf_paths.empty()); + for (; p < ioptions.cf_paths.size() - 1; p++) { + uint64_t target_size = ioptions.cf_paths[p].target_size; + if (target_size > file_size && + accumulated_size + (target_size - file_size) > future_size) { + return p; + } + accumulated_size += target_size; + } + return p; +} + +// +// Consider compaction files based on their size differences with +// the next file in time order. +// +Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns( + unsigned int ratio, unsigned int max_number_of_files_to_compact) { + unsigned int min_merge_width = + mutable_cf_options_.compaction_options_universal.min_merge_width; + unsigned int max_merge_width = + mutable_cf_options_.compaction_options_universal.max_merge_width; + + const SortedRun* sr = nullptr; + bool done = false; + size_t start_index = 0; + unsigned int candidate_count = 0; + + unsigned int max_files_to_compact = + std::min(max_merge_width, max_number_of_files_to_compact); + min_merge_width = std::max(min_merge_width, 2U); + + // Caller checks the size before executing this function. This invariant is + // important because otherwise we may have a possible integer underflow when + // dealing with unsigned types. + assert(sorted_runs_.size() > 0); + + // Considers a candidate file only if it is smaller than the + // total size accumulated so far. + for (size_t loop = 0; loop < sorted_runs_.size(); loop++) { + candidate_count = 0; + + // Skip files that are already being compacted + for (sr = nullptr; loop < sorted_runs_.size(); loop++) { + sr = &sorted_runs_[loop]; + + if (!sr->being_compacted) { + candidate_count = 1; + break; + } + char file_num_buf[kFormatFileNumberBufSize]; + sr->Dump(file_num_buf, sizeof(file_num_buf)); + ROCKS_LOG_BUFFER(log_buffer_, + "[%s] Universal: %s" + "[%d] being compacted, skipping", + cf_name_.c_str(), file_num_buf, loop); + + sr = nullptr; + } + + // This file is not being compacted. Consider it as the + // first candidate to be compacted. + uint64_t candidate_size = sr != nullptr ? sr->compensated_file_size : 0; + if (sr != nullptr) { + char file_num_buf[kFormatFileNumberBufSize]; + sr->Dump(file_num_buf, sizeof(file_num_buf), true); + ROCKS_LOG_BUFFER(log_buffer_, + "[%s] Universal: Possible candidate %s[%d].", + cf_name_.c_str(), file_num_buf, loop); + } + + // Check if the succeeding files need compaction. 
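GetPathId() above walks the configured cf_paths and skips a path unless it can hold the new file while the space accumulated so far still covers the estimated future file. A standalone sketch of the same arithmetic, using made-up target sizes in GB, is given here; it is not part of the patch.

#include <cstdint>
#include <iostream>
#include <vector>

// Re-runs the GetPathId() arithmetic on a hypothetical path layout.
uint32_t PickPath(const std::vector<uint64_t>& path_target_sizes,
                  uint64_t file_size, unsigned int size_ratio) {
  uint64_t accumulated_size = 0;
  // Room that must remain for the file this one will eventually grow into.
  const uint64_t future_size = file_size * (100 - size_ratio) / 100;
  uint32_t p = 0;
  for (; p + 1 < path_target_sizes.size(); p++) {
    const uint64_t target_size = path_target_sizes[p];
    if (target_size > file_size &&
        accumulated_size + (target_size - file_size) > future_size) {
      return p;
    }
    accumulated_size += target_size;
  }
  return p;  // the last path takes whatever does not fit earlier
}

int main() {
  // Two fast paths of 100 GB and 200 GB, plus a large archive path.
  const std::vector<uint64_t> targets = {100, 200, 1000};
  std::cout << PickPath(targets, 30, /*size_ratio=*/1) << '\n';   // 0
  std::cout << PickPath(targets, 150, /*size_ratio=*/1) << '\n';  // 1
  std::cout << PickPath(targets, 400, /*size_ratio=*/1) << '\n';  // 2
}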
+ for (size_t i = loop + 1; + candidate_count < max_files_to_compact && i < sorted_runs_.size(); + i++) { + const SortedRun* succeeding_sr = &sorted_runs_[i]; + if (succeeding_sr->being_compacted) { + break; + } + // Pick files if the total/last candidate file size (increased by the + // specified ratio) is still larger than the next candidate file. + // candidate_size is the total size of files picked so far with the + // default kCompactionStopStyleTotalSize; with + // kCompactionStopStyleSimilarSize, it's simply the size of the last + // picked file. + double sz = candidate_size * (100.0 + ratio) / 100.0; + if (sz < static_cast<double>(succeeding_sr->size)) { + break; + } + if (mutable_cf_options_.compaction_options_universal.stop_style == + kCompactionStopStyleSimilarSize) { + // Similar-size stopping rule: also check the last picked file isn't + // far larger than the next candidate file. + sz = (succeeding_sr->size * (100.0 + ratio)) / 100.0; + if (sz < static_cast<double>(candidate_size)) { + // If the small file we've encountered begins a run of similar-size + // files, we'll pick them up on a future iteration of the outer + // loop. If it's some lonely straggler, it'll eventually get picked + // by the last-resort read amp strategy which disregards size ratios. + break; + } + candidate_size = succeeding_sr->compensated_file_size; + } else { // default kCompactionStopStyleTotalSize + candidate_size += succeeding_sr->compensated_file_size; + } + candidate_count++; + } + + // Found a series of consecutive files that need compaction. + if (candidate_count >= (unsigned int)min_merge_width) { + start_index = loop; + done = true; + break; + } else { + for (size_t i = loop; + i < loop + candidate_count && i < sorted_runs_.size(); i++) { + const SortedRun* skipping_sr = &sorted_runs_[i]; + char file_num_buf[256]; + skipping_sr->DumpSizeInfo(file_num_buf, sizeof(file_num_buf), loop); + ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: Skipping %s", + cf_name_.c_str(), file_num_buf); + } + } + } + if (!done || candidate_count <= 1) { + return nullptr; + } + size_t first_index_after = start_index + candidate_count; + // Compression is enabled if files compacted earlier already reached + // size ratio of compression. 
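The candidate loop above, with the default kCompactionStopStyleTotalSize, keeps absorbing the next older sorted run as long as that run is no larger than the bytes gathered so far inflated by size_ratio percent. A toy rerun of the rule with made-up run sizes follows; it is not part of the patch.

#include <cstdint>
#include <iostream>
#include <vector>

// Newest-first run sizes and a 1% size_ratio: the runs of size 1, 1, 2 and 4
// are picked, while the run of size 16 ends the candidate window.
int main() {
  const std::vector<uint64_t> run_sizes = {1, 1, 2, 4, 16};
  const unsigned int ratio = 1;  // size_ratio
  uint64_t candidate_size = run_sizes[0];
  size_t candidate_count = 1;
  for (size_t i = 1; i < run_sizes.size(); ++i) {
    const double allowed = candidate_size * (100.0 + ratio) / 100.0;
    if (allowed < static_cast<double>(run_sizes[i])) {
      break;  // the next run is too large relative to what we have
    }
    candidate_size += run_sizes[i];
    ++candidate_count;
  }
  std::cout << candidate_count << " runs, " << candidate_size << " bytes\n";
  // prints "4 runs, 8 bytes"
}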
+ bool enable_compression = true; + int ratio_to_compress = + mutable_cf_options_.compaction_options_universal.compression_size_percent; + if (ratio_to_compress >= 0) { + uint64_t total_size = 0; + for (auto& sorted_run : sorted_runs_) { + total_size += sorted_run.compensated_file_size; + } + + uint64_t older_file_size = 0; + for (size_t i = sorted_runs_.size() - 1; i >= first_index_after; i--) { + older_file_size += sorted_runs_[i].size; + if (older_file_size * 100L >= total_size * (long)ratio_to_compress) { + enable_compression = false; + break; + } + } + } + + uint64_t estimated_total_size = 0; + for (unsigned int i = 0; i < first_index_after; i++) { + estimated_total_size += sorted_runs_[i].size; + } + uint32_t path_id = + GetPathId(ioptions_, mutable_cf_options_, estimated_total_size); + int start_level = sorted_runs_[start_index].level; + int output_level; + if (first_index_after == sorted_runs_.size()) { + output_level = vstorage_->num_levels() - 1; + } else if (sorted_runs_[first_index_after].level == 0) { + output_level = 0; + } else { + output_level = sorted_runs_[first_index_after].level - 1; + } + + // last level is reserved for the files ingested behind + if (ioptions_.allow_ingest_behind && + (output_level == vstorage_->num_levels() - 1)) { + assert(output_level > 1); + output_level--; + } + + std::vector<CompactionInputFiles> inputs(vstorage_->num_levels()); + for (size_t i = 0; i < inputs.size(); ++i) { + inputs[i].level = start_level + static_cast<int>(i); + } + for (size_t i = start_index; i < first_index_after; i++) { + auto& picking_sr = sorted_runs_[i]; + if (picking_sr.level == 0) { + FileMetaData* picking_file = picking_sr.file; + inputs[0].files.push_back(picking_file); + } else { + auto& files = inputs[picking_sr.level - start_level].files; + for (auto* f : vstorage_->LevelFiles(picking_sr.level)) { + files.push_back(f); + } + } + char file_num_buf[256]; + picking_sr.DumpSizeInfo(file_num_buf, sizeof(file_num_buf), i); + ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: Picking %s", + cf_name_.c_str(), file_num_buf); + } + + std::vector<FileMetaData*> grandparents; + // Include grandparents for potential file cutting in incremental + // mode. It is for aligning file cutting boundaries across levels, + // so that subsequent compactions can pick files with aligned + // buffer. + // Single files are only picked up in incremental mode, so that + // there is no need for full range. 
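The compression_size_percent check above disables compression when the compaction output would still sit inside the newest (100 - percent)% of the data. A standalone sketch with made-up sizes follows; it is not part of the patch.

#include <cstdint>
#include <iostream>
#include <vector>

// Newest-first run sizes; the two newest runs are being compacted, and only
// the oldest 50% of the data should be compressed.
int main() {
  const std::vector<uint64_t> run_sizes = {1, 1, 2, 4, 8};
  const size_t first_index_after = 2;  // runs [0, 2) are being compacted
  const int ratio_to_compress = 50;    // compression_size_percent

  uint64_t total_size = 0;
  for (uint64_t s : run_sizes) total_size += s;

  bool enable_compression = true;
  uint64_t older_file_size = 0;
  for (size_t i = run_sizes.size(); i-- > first_index_after;) {
    older_file_size += run_sizes[i];
    if (older_file_size * 100 >= total_size * ratio_to_compress) {
      enable_compression = false;  // output stays in the newer half: skip it
      break;
    }
  }
  std::cout << std::boolalpha << enable_compression << '\n';  // false
}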
+ if (mutable_cf_options_.compaction_options_universal.incremental && + first_index_after < sorted_runs_.size() && + sorted_runs_[first_index_after].level > 1) { + grandparents = vstorage_->LevelFiles(sorted_runs_[first_index_after].level); + } + + if (output_level != 0 && + picker_->FilesRangeOverlapWithCompaction( + inputs, output_level, + Compaction::EvaluatePenultimateLevel(vstorage_, ioptions_, + start_level, output_level))) { + return nullptr; + } + CompactionReason compaction_reason; + if (max_number_of_files_to_compact == UINT_MAX) { + compaction_reason = CompactionReason::kUniversalSizeRatio; + } else { + compaction_reason = CompactionReason::kUniversalSortedRunNum; + } + return new Compaction(vstorage_, ioptions_, mutable_cf_options_, + mutable_db_options_, std::move(inputs), output_level, + MaxFileSizeForLevel(mutable_cf_options_, output_level, + kCompactionStyleUniversal), + GetMaxOverlappingBytes(), path_id, + GetCompressionType(vstorage_, mutable_cf_options_, + output_level, 1, enable_compression), + GetCompressionOptions(mutable_cf_options_, vstorage_, + output_level, enable_compression), + Temperature::kUnknown, + /* max_subcompactions */ 0, grandparents, + /* is manual */ false, /* trim_ts */ "", score_, + false /* deletion_compaction */, + /* l0_files_might_overlap */ true, compaction_reason); +} + +// Look at overall size amplification. If size amplification +// exceeds the configured value, then do a compaction +// of the candidate files all the way upto the earliest +// base file (overrides configured values of file-size ratios, +// min_merge_width and max_merge_width). +// +Compaction* UniversalCompactionBuilder::PickCompactionToReduceSizeAmp() { + // percentage flexibility while reducing size amplification + uint64_t ratio = mutable_cf_options_.compaction_options_universal + .max_size_amplification_percent; + + unsigned int candidate_count = 0; + uint64_t candidate_size = 0; + size_t start_index = 0; + const SortedRun* sr = nullptr; + + assert(!sorted_runs_.empty()); + if (sorted_runs_.back().being_compacted) { + return nullptr; + } + + // Skip files that are already being compacted + for (size_t loop = 0; loop + 1 < sorted_runs_.size(); loop++) { + sr = &sorted_runs_[loop]; + if (!sr->being_compacted) { + start_index = loop; // Consider this as the first candidate. 
+ break; + } + char file_num_buf[kFormatFileNumberBufSize]; + sr->Dump(file_num_buf, sizeof(file_num_buf), true); + ROCKS_LOG_BUFFER(log_buffer_, + "[%s] Universal: skipping %s[%d] compacted %s", + cf_name_.c_str(), file_num_buf, loop, + " cannot be a candidate to reduce size amp.\n"); + sr = nullptr; + } + + if (sr == nullptr) { + return nullptr; // no candidate files + } + { + char file_num_buf[kFormatFileNumberBufSize]; + sr->Dump(file_num_buf, sizeof(file_num_buf), true); + ROCKS_LOG_BUFFER( + log_buffer_, + "[%s] Universal: First candidate %s[%" ROCKSDB_PRIszt "] %s", + cf_name_.c_str(), file_num_buf, start_index, " to reduce size amp.\n"); + } + + // size of the base sorted run for size amp calculation + uint64_t base_sr_size = sorted_runs_.back().size; + size_t sr_end_idx = sorted_runs_.size() - 1; + // If tiered compaction is enabled and the last sorted run is the last level + if (ioptions_.preclude_last_level_data_seconds > 0 && + ioptions_.num_levels > 2 && + sorted_runs_.back().level == ioptions_.num_levels - 1 && + sorted_runs_.size() > 1) { + sr_end_idx = sorted_runs_.size() - 2; + base_sr_size = sorted_runs_[sr_end_idx].size; + } + + // keep adding up all the remaining files + for (size_t loop = start_index; loop < sr_end_idx; loop++) { + sr = &sorted_runs_[loop]; + if (sr->being_compacted) { + // TODO: when incremental compaction is supported, we might want to + // schedule some incremental compactions in parallel if needed. + char file_num_buf[kFormatFileNumberBufSize]; + sr->Dump(file_num_buf, sizeof(file_num_buf), true); + ROCKS_LOG_BUFFER( + log_buffer_, "[%s] Universal: Possible candidate %s[%d] %s", + cf_name_.c_str(), file_num_buf, start_index, + " is already being compacted. No size amp reduction possible.\n"); + return nullptr; + } + candidate_size += sr->compensated_file_size; + candidate_count++; + } + if (candidate_count == 0) { + return nullptr; + } + + // size amplification = percentage of additional size + if (candidate_size * 100 < ratio * base_sr_size) { + ROCKS_LOG_BUFFER( + log_buffer_, + "[%s] Universal: size amp not needed. newer-files-total-size %" PRIu64 + " earliest-file-size %" PRIu64, + cf_name_.c_str(), candidate_size, base_sr_size); + return nullptr; + } else { + ROCKS_LOG_BUFFER( + log_buffer_, + "[%s] Universal: size amp needed. newer-files-total-size %" PRIu64 + " earliest-file-size %" PRIu64, + cf_name_.c_str(), candidate_size, base_sr_size); + } + // Since incremental compaction can't include more than the second-to-last + // level, it can introduce a penalty compared to full compaction. We + // hard-code the penalty to be 80%: if the incremental pick would end up + // with a fanout more than 80% higher than that of a full compaction, we + // fall back to full level compaction. + // The 80% threshold is arbitrary and can be adjusted or made + // configurable in the future. + // This also prevents the case where compaction falls behind and we + // need to compact more levels for compactions to catch up. + if (mutable_cf_options_.compaction_options_universal.incremental) { + double fanout_threshold = static_cast<double>(base_sr_size) / + static_cast<double>(candidate_size) * 1.8; + Compaction* picked = PickIncrementalForReduceSizeAmp(fanout_threshold); + if (picked != nullptr) { + // As the feature is still being developed, picking an incremental + // compaction might fail, in which case we fall back to compacting the + // full level.
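Putting numbers to the two checks above: size amplification compares the newer runs' total size against max_size_amplification_percent of the base run, and the incremental path is then given a fanout budget of 1.8x the full compaction's fanout. A standalone sketch with assumed sizes follows; it is not part of the patch.

#include <cstdint>
#include <iostream>

// Assumed sizes: the base (oldest) run holds 100 units and the newer runs
// hold 250 units in total, with the default 200% amplification limit.
int main() {
  const uint64_t ratio = 200;           // max_size_amplification_percent
  const uint64_t base_sr_size = 100;    // size of the base sorted run
  const uint64_t candidate_size = 250;  // total size of the newer sorted runs

  // Size amplification = newer bytes as a percentage of the base run.
  const bool need_size_amp_compaction =
      !(candidate_size * 100 < ratio * base_sr_size);
  std::cout << std::boolalpha << need_size_amp_compaction << '\n';  // true

  // A full compaction would have fanout base/candidate = 0.4; the incremental
  // pick may use at most 80% more than that before falling back.
  const double fanout_threshold = static_cast<double>(base_sr_size) /
                                  static_cast<double>(candidate_size) * 1.8;
  std::cout << fanout_threshold << '\n';  // 0.72
}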
+ return picked; + } + } + return PickCompactionWithSortedRunRange( + start_index, sr_end_idx, CompactionReason::kUniversalSizeAmplification); +} + +Compaction* UniversalCompactionBuilder::PickIncrementalForReduceSizeAmp( + double fanout_threshold) { + // Try find all potential compactions with total size just over + // options.max_compaction_size / 2, and take the one with the lowest + // fanout (defined in declaration of the function). + // This is done by having a sliding window of the files at the second + // lowest level, and keep expanding while finding overlapping in the + // last level. Once total size exceeds the size threshold, calculate + // the fanout value. And then shrinking from the small side of the + // window. Keep doing it until the end. + // Finally, we try to include upper level files if they fall into + // the range. + // + // Note that it is a similar problem as leveled compaction's + // kMinOverlappingRatio priority, but instead of picking single files + // we expand to a target compaction size. The reason is that in + // leveled compaction, actual fanout value tends to high, e.g. 10, so + // even with single file in down merging level, the extra size + // compacted in boundary files is at a lower ratio. But here users + // often have size of second last level size to be 1/4, 1/3 or even + // 1/2 of the bottommost level, so picking single file in second most + // level will cause significant waste, which is not desirable. + // + // This algorithm has lots of room to improve to pick more efficient + // compactions. + assert(sorted_runs_.size() >= 2); + int second_last_level = sorted_runs_[sorted_runs_.size() - 2].level; + if (second_last_level == 0) { + // Can't split Level 0. + return nullptr; + } + int output_level = sorted_runs_.back().level; + const std::vector<FileMetaData*>& bottom_files = + vstorage_->LevelFiles(output_level); + const std::vector<FileMetaData*>& files = + vstorage_->LevelFiles(second_last_level); + assert(!bottom_files.empty()); + assert(!files.empty()); + + // std::unordered_map<uint64_t, uint64_t> file_to_order; + + int picked_start_idx = 0; + int picked_end_idx = 0; + double picked_fanout = fanout_threshold; + + // Use half target compaction bytes as anchor to stop growing second most + // level files, and reserve growing space for more overlapping bottom level, + // clean cut, files from other levels, etc. + uint64_t comp_thres_size = mutable_cf_options_.max_compaction_bytes / 2; + int start_idx = 0; + int bottom_end_idx = 0; + int bottom_start_idx = 0; + uint64_t non_bottom_size = 0; + uint64_t bottom_size = 0; + bool end_bottom_size_counted = false; + for (int end_idx = 0; end_idx < static_cast<int>(files.size()); end_idx++) { + FileMetaData* end_file = files[end_idx]; + + // Include bottom most level files smaller than the current second + // last level file. + int num_skipped = 0; + while (bottom_end_idx < static_cast<int>(bottom_files.size()) && + icmp_->Compare(bottom_files[bottom_end_idx]->largest, + end_file->smallest) < 0) { + if (!end_bottom_size_counted) { + bottom_size += bottom_files[bottom_end_idx]->fd.file_size; + } + bottom_end_idx++; + end_bottom_size_counted = false; + num_skipped++; + } + + if (num_skipped > 1) { + // At least a file in the bottom most level falls into the file gap. No + // reason to include the file. We cut the range and start a new sliding + // window. + start_idx = end_idx; + } + + if (start_idx == end_idx) { + // new sliding window. 
+ non_bottom_size = 0; + bottom_size = 0; + bottom_start_idx = bottom_end_idx; + end_bottom_size_counted = false; + } + + non_bottom_size += end_file->fd.file_size; + + // Include all overlapping files in bottom level. + while (bottom_end_idx < static_cast<int>(bottom_files.size()) && + icmp_->Compare(bottom_files[bottom_end_idx]->smallest, + end_file->largest) < 0) { + if (!end_bottom_size_counted) { + bottom_size += bottom_files[bottom_end_idx]->fd.file_size; + end_bottom_size_counted = true; + } + if (icmp_->Compare(bottom_files[bottom_end_idx]->largest, + end_file->largest) > 0) { + // next level file cross large boundary of current file. + break; + } + bottom_end_idx++; + end_bottom_size_counted = false; + } + + if ((non_bottom_size + bottom_size > comp_thres_size || + end_idx == static_cast<int>(files.size()) - 1) && + non_bottom_size > 0) { // Do we alow 0 size file at all? + // If it is a better compaction, remember it in picked* variables. + double fanout = static_cast<double>(bottom_size) / + static_cast<double>(non_bottom_size); + if (fanout < picked_fanout) { + picked_start_idx = start_idx; + picked_end_idx = end_idx; + picked_fanout = fanout; + } + // Shrink from the start end to under comp_thres_size + while (non_bottom_size + bottom_size > comp_thres_size && + start_idx <= end_idx) { + non_bottom_size -= files[start_idx]->fd.file_size; + start_idx++; + if (start_idx < static_cast<int>(files.size())) { + while (bottom_start_idx <= bottom_end_idx && + icmp_->Compare(bottom_files[bottom_start_idx]->largest, + files[start_idx]->smallest) < 0) { + bottom_size -= bottom_files[bottom_start_idx]->fd.file_size; + bottom_start_idx++; + } + } + } + } + } + + if (picked_fanout >= fanout_threshold) { + assert(picked_fanout == fanout_threshold); + return nullptr; + } + + std::vector<CompactionInputFiles> inputs; + CompactionInputFiles bottom_level_inputs; + CompactionInputFiles second_last_level_inputs; + second_last_level_inputs.level = second_last_level; + bottom_level_inputs.level = output_level; + for (int i = picked_start_idx; i <= picked_end_idx; i++) { + if (files[i]->being_compacted) { + return nullptr; + } + second_last_level_inputs.files.push_back(files[i]); + } + assert(!second_last_level_inputs.empty()); + if (!picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, + &second_last_level_inputs, + /*next_smallest=*/nullptr)) { + return nullptr; + } + // We might be able to avoid this binary search if we save and expand + // from bottom_start_idx and bottom_end_idx, but for now, we use + // SetupOtherInputs() for simplicity. + int parent_index = -1; // Create and use bottom_start_idx? + if (!picker_->SetupOtherInputs(cf_name_, mutable_cf_options_, vstorage_, + &second_last_level_inputs, + &bottom_level_inputs, &parent_index, + /*base_index=*/-1)) { + return nullptr; + } + + // Try to include files in upper levels if they fall into the range. + // Since we need to go from lower level up and this is in the reverse + // order, compared to level order, we first write to an reversed + // data structure and finally copy them to compaction inputs. 
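Stripped of the window expansion and the comp_thres_size bookkeeping, the picking above reduces to scoring each candidate window by bottom_size / non_bottom_size and keeping the lowest. A simplified standalone sketch with made-up window sizes follows; it is not part of the patch.

#include <cstdint>
#include <iostream>
#include <vector>

// Each hypothetical window pairs the second-to-last-level bytes it contains
// with the bottommost-level bytes it overlaps.
struct Window {
  uint64_t non_bottom_size;
  uint64_t bottom_size;
};

int main() {
  const std::vector<Window> candidates = {{40, 120}, {50, 60}, {30, 90}};
  double picked_fanout = 1e100;
  size_t picked = 0;
  for (size_t i = 0; i < candidates.size(); ++i) {
    const double fanout = static_cast<double>(candidates[i].bottom_size) /
                          static_cast<double>(candidates[i].non_bottom_size);
    if (fanout < picked_fanout) {
      picked_fanout = fanout;
      picked = i;
    }
  }
  std::cout << "window " << picked << " fanout " << picked_fanout << '\n';
  // prints "window 1 fanout 1.2"
}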
+ InternalKey smallest, largest; + picker_->GetRange(second_last_level_inputs, &smallest, &largest); + std::vector<CompactionInputFiles> inputs_reverse; + for (auto it = ++(++sorted_runs_.rbegin()); it != sorted_runs_.rend(); it++) { + SortedRun& sr = *it; + if (sr.level == 0) { + break; + } + std::vector<FileMetaData*> level_inputs; + vstorage_->GetCleanInputsWithinInterval(sr.level, &smallest, &largest, + &level_inputs); + if (!level_inputs.empty()) { + inputs_reverse.push_back({}); + inputs_reverse.back().level = sr.level; + inputs_reverse.back().files = level_inputs; + picker_->GetRange(inputs_reverse.back(), &smallest, &largest); + } + } + for (auto it = inputs_reverse.rbegin(); it != inputs_reverse.rend(); it++) { + inputs.push_back(*it); + } + + inputs.push_back(second_last_level_inputs); + inputs.push_back(bottom_level_inputs); + + int start_level = Compaction::kInvalidLevel; + for (const auto& in : inputs) { + if (!in.empty()) { + // inputs should already be sorted by level + start_level = in.level; + break; + } + } + + // intra L0 compactions outputs could have overlap + if (output_level != 0 && + picker_->FilesRangeOverlapWithCompaction( + inputs, output_level, + Compaction::EvaluatePenultimateLevel(vstorage_, ioptions_, + start_level, output_level))) { + return nullptr; + } + + // TODO support multi paths? + uint32_t path_id = 0; + return new Compaction( + vstorage_, ioptions_, mutable_cf_options_, mutable_db_options_, + std::move(inputs), output_level, + MaxFileSizeForLevel(mutable_cf_options_, output_level, + kCompactionStyleUniversal), + GetMaxOverlappingBytes(), path_id, + GetCompressionType(vstorage_, mutable_cf_options_, output_level, 1, + true /* enable_compression */), + GetCompressionOptions(mutable_cf_options_, vstorage_, output_level, + true /* enable_compression */), + Temperature::kUnknown, + /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ false, + /* trim_ts */ "", score_, false /* deletion_compaction */, + /* l0_files_might_overlap */ true, + CompactionReason::kUniversalSizeAmplification); +} + +// Pick files marked for compaction. Typically, files are marked by +// CompactOnDeleteCollector due to the presence of tombstones. +Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() { + CompactionInputFiles start_level_inputs; + int output_level; + std::vector<CompactionInputFiles> inputs; + std::vector<FileMetaData*> grandparents; + + if (vstorage_->num_levels() == 1) { + // This is single level universal. Since we're basically trying to reclaim + // space by processing files marked for compaction due to high tombstone + // density, let's do the same thing as compaction to reduce size amp which + // has the same goals. + int start_index = -1; + + start_level_inputs.level = 0; + start_level_inputs.files.clear(); + output_level = 0; + // Find the first file marked for compaction. Ignore the last file + for (size_t loop = 0; loop + 1 < sorted_runs_.size(); loop++) { + SortedRun* sr = &sorted_runs_[loop]; + if (sr->being_compacted) { + continue; + } + FileMetaData* f = vstorage_->LevelFiles(0)[loop]; + if (f->marked_for_compaction) { + start_level_inputs.files.push_back(f); + start_index = + static_cast<int>(loop); // Consider this as the first candidate. 
+ break; + } + } + if (start_index < 0) { + // Either no file marked, or they're already being compacted + return nullptr; + } + + for (size_t loop = start_index + 1; loop < sorted_runs_.size(); loop++) { + SortedRun* sr = &sorted_runs_[loop]; + if (sr->being_compacted) { + break; + } + + FileMetaData* f = vstorage_->LevelFiles(0)[loop]; + start_level_inputs.files.push_back(f); + } + if (start_level_inputs.size() <= 1) { + // If only the last file in L0 is marked for compaction, ignore it + return nullptr; + } + inputs.push_back(start_level_inputs); + } else { + int start_level; + + // For multi-level universal, the strategy is to make this look more like + // leveled. We pick one of the files marked for compaction and compact with + // overlapping files in the adjacent level. + picker_->PickFilesMarkedForCompaction(cf_name_, vstorage_, &start_level, + &output_level, &start_level_inputs); + if (start_level_inputs.empty()) { + return nullptr; + } + + // Pick the first non-empty level after the start_level + for (output_level = start_level + 1; output_level < vstorage_->num_levels(); + output_level++) { + if (vstorage_->NumLevelFiles(output_level) != 0) { + break; + } + } + + // If all higher levels are empty, pick the highest level as output level + if (output_level == vstorage_->num_levels()) { + if (start_level == 0) { + output_level = vstorage_->num_levels() - 1; + } else { + // If start level is non-zero and all higher levels are empty, this + // compaction will translate into a trivial move. Since the idea is + // to reclaim space and trivial move doesn't help with that, we + // skip compaction in this case and return nullptr + return nullptr; + } + } + if (ioptions_.allow_ingest_behind && + output_level == vstorage_->num_levels() - 1) { + assert(output_level > 1); + output_level--; + } + + if (output_level != 0) { + if (start_level == 0) { + if (!picker_->GetOverlappingL0Files(vstorage_, &start_level_inputs, + output_level, nullptr)) { + return nullptr; + } + } + + CompactionInputFiles output_level_inputs; + int parent_index = -1; + + output_level_inputs.level = output_level; + if (!picker_->SetupOtherInputs(cf_name_, mutable_cf_options_, vstorage_, + &start_level_inputs, &output_level_inputs, + &parent_index, -1)) { + return nullptr; + } + inputs.push_back(start_level_inputs); + if (!output_level_inputs.empty()) { + inputs.push_back(output_level_inputs); + } + if (picker_->FilesRangeOverlapWithCompaction( + inputs, output_level, + Compaction::EvaluatePenultimateLevel( + vstorage_, ioptions_, start_level, output_level))) { + return nullptr; + } + + picker_->GetGrandparents(vstorage_, start_level_inputs, + output_level_inputs, &grandparents); + } else { + inputs.push_back(start_level_inputs); + } + } + + uint64_t estimated_total_size = 0; + // Use size of the output level as estimated file size + for (FileMetaData* f : vstorage_->LevelFiles(output_level)) { + estimated_total_size += f->fd.GetFileSize(); + } + uint32_t path_id = + GetPathId(ioptions_, mutable_cf_options_, estimated_total_size); + return new Compaction( + vstorage_, ioptions_, mutable_cf_options_, mutable_db_options_, + std::move(inputs), output_level, + MaxFileSizeForLevel(mutable_cf_options_, output_level, + kCompactionStyleUniversal), + /* max_grandparent_overlap_bytes */ GetMaxOverlappingBytes(), path_id, + GetCompressionType(vstorage_, mutable_cf_options_, output_level, 1), + GetCompressionOptions(mutable_cf_options_, vstorage_, output_level), + Temperature::kUnknown, + /* max_subcompactions */ 0, grandparents, /* 
is manual */ false, + /* trim_ts */ "", score_, false /* deletion_compaction */, + /* l0_files_might_overlap */ true, + CompactionReason::kFilesMarkedForCompaction); +} + +Compaction* UniversalCompactionBuilder::PickCompactionToOldest( + size_t start_index, CompactionReason compaction_reason) { + return PickCompactionWithSortedRunRange(start_index, sorted_runs_.size() - 1, + compaction_reason); +} + +Compaction* UniversalCompactionBuilder::PickCompactionWithSortedRunRange( + size_t start_index, size_t end_index, CompactionReason compaction_reason) { + assert(start_index < sorted_runs_.size()); + + // Estimate total file size + uint64_t estimated_total_size = 0; + for (size_t loop = start_index; loop <= end_index; loop++) { + estimated_total_size += sorted_runs_[loop].size; + } + uint32_t path_id = + GetPathId(ioptions_, mutable_cf_options_, estimated_total_size); + int start_level = sorted_runs_[start_index].level; + + std::vector<CompactionInputFiles> inputs(vstorage_->num_levels()); + for (size_t i = 0; i < inputs.size(); ++i) { + inputs[i].level = start_level + static_cast<int>(i); + } + for (size_t loop = start_index; loop <= end_index; loop++) { + auto& picking_sr = sorted_runs_[loop]; + if (picking_sr.level == 0) { + FileMetaData* f = picking_sr.file; + inputs[0].files.push_back(f); + } else { + auto& files = inputs[picking_sr.level - start_level].files; + for (auto* f : vstorage_->LevelFiles(picking_sr.level)) { + files.push_back(f); + } + } + std::string comp_reason_print_string; + if (compaction_reason == CompactionReason::kPeriodicCompaction) { + comp_reason_print_string = "periodic compaction"; + } else if (compaction_reason == + CompactionReason::kUniversalSizeAmplification) { + comp_reason_print_string = "size amp"; + } else { + assert(false); + comp_reason_print_string = "unknown: "; + comp_reason_print_string.append( + std::to_string(static_cast<int>(compaction_reason))); + } + + char file_num_buf[256]; + picking_sr.DumpSizeInfo(file_num_buf, sizeof(file_num_buf), loop); + ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: %s picking %s", + cf_name_.c_str(), comp_reason_print_string.c_str(), + file_num_buf); + } + + int output_level; + if (end_index == sorted_runs_.size() - 1) { + // output files at the last level, unless it's reserved + output_level = vstorage_->num_levels() - 1; + // last level is reserved for the files ingested behind + if (ioptions_.allow_ingest_behind) { + assert(output_level > 1); + output_level--; + } + } else { + // if it's not including all sorted_runs, it can only output to the level + // above the `end_index + 1` sorted_run. + output_level = sorted_runs_[end_index + 1].level - 1; + } + + // intra L0 compactions outputs could have overlap + if (output_level != 0 && + picker_->FilesRangeOverlapWithCompaction( + inputs, output_level, + Compaction::EvaluatePenultimateLevel(vstorage_, ioptions_, + start_level, output_level))) { + return nullptr; + } + + // We never check size for + // compaction_options_universal.compression_size_percent, + // because we always compact all the files, so always compress. 
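The output-level selection above reduces to a simple rule: a pick that reaches the last sorted run targets the bottom level (minus one when allow_ingest_behind reserves it), while any other pick outputs one level above the first sorted run excluded from the pick. Below is a minimal standalone restatement of that rule; RunInfo and PickOutputLevel are illustrative stand-ins, not the real SortedRun/VersionStorageInfo machinery.

    #include <cassert>
    #include <cstddef>
    #include <vector>

    // Illustrative stand-in for a sorted run: only the level matters here.
    struct RunInfo {
      int level;  // 0 for an L0 file, otherwise the run's LSM level
    };

    int PickOutputLevel(const std::vector<RunInfo>& runs, size_t end_index,
                        int num_levels, bool allow_ingest_behind) {
      if (end_index == runs.size() - 1) {
        // Compacting through the oldest run: target the last level, or the one
        // above it when the bottom level is reserved for ingest-behind files.
        int output_level = num_levels - 1;
        if (allow_ingest_behind) {
          assert(output_level > 1);
          --output_level;
        }
        return output_level;
      }
      // Otherwise output just above the first sorted run left out of the pick.
      return runs[end_index + 1].level - 1;
    }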
+ return new Compaction( + vstorage_, ioptions_, mutable_cf_options_, mutable_db_options_, + std::move(inputs), output_level, + MaxFileSizeForLevel(mutable_cf_options_, output_level, + kCompactionStyleUniversal), + GetMaxOverlappingBytes(), path_id, + GetCompressionType(vstorage_, mutable_cf_options_, output_level, 1, + true /* enable_compression */), + GetCompressionOptions(mutable_cf_options_, vstorage_, output_level, + true /* enable_compression */), + Temperature::kUnknown, + /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ false, + /* trim_ts */ "", score_, false /* deletion_compaction */, + /* l0_files_might_overlap */ true, compaction_reason); +} + +Compaction* UniversalCompactionBuilder::PickPeriodicCompaction() { + ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: Periodic Compaction", + cf_name_.c_str()); + + // In universal compaction, sorted runs containing older data are almost + // always generated earlier as well. To simplify the problem, we just try to + // trigger a full compaction. We start from the oldest sorted run and include + // all sorted runs, until we hit a sorted run that is already being compacted. + // Since usually the largest (which is usually the oldest) sorted run is + // included anyway, doing a full compaction won't increase write + // amplification much. + + // Get some information from marked files to check whether a file is + // included in the compaction. + + size_t start_index = sorted_runs_.size(); + while (start_index > 0 && !sorted_runs_[start_index - 1].being_compacted) { + start_index--; + } + if (start_index == sorted_runs_.size()) { + return nullptr; + } + + // There is a rare corner case where we can't pick up all the files + // because some of them are being compacted, so the files we do pick may + // not include any that are marked for periodic compaction. To keep the + // logic simple, we execute the compaction anyway, unless the pick consists + // of only the last sorted run (either the last level or the last L0 file), + // in which case we verify below that it is actually marked. + if (start_index == sorted_runs_.size() - 1) { + bool included_file_marked = false; + int start_level = sorted_runs_[start_index].level; + FileMetaData* start_file = sorted_runs_[start_index].file; + for (const std::pair<int, FileMetaData*>& level_file_pair : + vstorage_->FilesMarkedForPeriodicCompaction()) { + if (start_level != 0) { + // Last sorted run is a level + if (start_level == level_file_pair.first) { + included_file_marked = true; + break; + } + } else { + // Last sorted run is an L0 file. + if (start_file == level_file_pair.second) { + included_file_marked = true; + break; + } + } + } + if (!included_file_marked) { + ROCKS_LOG_BUFFER(log_buffer_, + "[%s] Universal: Cannot form a compaction covering file " + "marked for periodic compaction", + cf_name_.c_str()); + return nullptr; + } + } + + Compaction* c = PickCompactionToOldest(start_index, + CompactionReason::kPeriodicCompaction); + + TEST_SYNC_POINT_CALLBACK( + "UniversalCompactionPicker::PickPeriodicCompaction:Return", c); + + return c; +} + +uint64_t UniversalCompactionBuilder::GetMaxOverlappingBytes() const { + if (!mutable_cf_options_.compaction_options_universal.incremental) { + return std::numeric_limits<uint64_t>::max(); + } else { + // Try to align the cutting boundary with files at the next level so that + // an output file doesn't end up with less than 1/2 of the target size and + // doesn't overlap with two full-size files at the next level.
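For a concrete feel of the cap returned just below: the incremental path limits overlap at the next level to 1.5x the per-file target, so with a 64 MB target_file_size_base (used here purely as an example value, not read from any real options object) a compaction may overlap at most 96 MB at the next level.

    #include <cstdint>
    #include <cstdio>

    int main() {
      const uint64_t target_file_size_base = 64ull << 20;  // 64 MB, example only
      const uint64_t cap = target_file_size_base / 2 * 3;  // 32 MB * 3 = 96 MB
      std::printf("max overlapping bytes = %llu\n",
                  static_cast<unsigned long long>(cap));
      return 0;
    }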
+ return mutable_cf_options_.target_file_size_base / 2 * 3; + } +} +} // namespace ROCKSDB_NAMESPACE + +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/db/compaction/compaction_picker_universal.h b/src/rocksdb/db/compaction/compaction_picker_universal.h new file mode 100644 index 000000000..5f897cc9b --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_picker_universal.h @@ -0,0 +1,32 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#ifndef ROCKSDB_LITE + +#include "db/compaction/compaction_picker.h" + +namespace ROCKSDB_NAMESPACE { +class UniversalCompactionPicker : public CompactionPicker { + public: + UniversalCompactionPicker(const ImmutableOptions& ioptions, + const InternalKeyComparator* icmp) + : CompactionPicker(ioptions, icmp) {} + virtual Compaction* PickCompaction( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + LogBuffer* log_buffer, + SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override; + virtual int MaxOutputLevel() const override { return NumberLevels() - 1; } + + virtual bool NeedsCompaction( + const VersionStorageInfo* vstorage) const override; +}; +} // namespace ROCKSDB_NAMESPACE +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/db/compaction/compaction_service_job.cc b/src/rocksdb/db/compaction/compaction_service_job.cc new file mode 100644 index 000000000..1d2e99d99 --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_service_job.cc @@ -0,0 +1,829 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
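The job code that follows drives an external CompactionService through exactly two calls: StartV2 with a serialized CompactionServiceInput, then WaitForCompleteV2 for the serialized result, with kUseLocal at either step falling back to local compaction. Below is a deliberately tiny sketch of that interface plus the Options wiring on the primary side; the class name, the assumed header location, and the "/tmp/primary_db" path are illustrative assumptions, not part of this diff, and a functional remote service would run DB::OpenAndCompact on the worker as the test service later in this diff does.

    #include <memory>
    #include <string>

    #include "rocksdb/db.h"
    #include "rocksdb/options.h"  // assumed header for the CompactionService interface

    // Minimal service that always declines the job, so the primary falls back
    // to running the compaction locally.
    class AlwaysLocalCompactionService
        : public ROCKSDB_NAMESPACE::CompactionService {
     public:
      const char* Name() const override { return "AlwaysLocalCompactionService"; }

      ROCKSDB_NAMESPACE::CompactionServiceJobStatus StartV2(
          const ROCKSDB_NAMESPACE::CompactionServiceJobInfo& /*info*/,
          const std::string& /*compaction_service_input*/) override {
        return ROCKSDB_NAMESPACE::CompactionServiceJobStatus::kUseLocal;
      }

      ROCKSDB_NAMESPACE::CompactionServiceJobStatus WaitForCompleteV2(
          const ROCKSDB_NAMESPACE::CompactionServiceJobInfo& /*info*/,
          std::string* /*compaction_service_result*/) override {
        return ROCKSDB_NAMESPACE::CompactionServiceJobStatus::kUseLocal;
      }
    };

    // Wiring sketch: the primary opts in by setting Options::compaction_service.
    int main() {
      ROCKSDB_NAMESPACE::Options options;
      options.create_if_missing = true;
      options.compaction_service =
          std::make_shared<AlwaysLocalCompactionService>();
      ROCKSDB_NAMESPACE::DB* db = nullptr;
      ROCKSDB_NAMESPACE::Status s =
          ROCKSDB_NAMESPACE::DB::Open(options, "/tmp/primary_db", &db);
      delete db;
      return s.ok() ? 0 : 1;
    }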
+ +#include "db/compaction/compaction_job.h" +#include "db/compaction/compaction_state.h" +#include "logging/logging.h" +#include "monitoring/iostats_context_imp.h" +#include "monitoring/thread_status_util.h" +#include "options/options_helper.h" +#include "rocksdb/utilities/options_type.h" + +#ifndef ROCKSDB_LITE +namespace ROCKSDB_NAMESPACE { +class SubcompactionState; + +CompactionServiceJobStatus +CompactionJob::ProcessKeyValueCompactionWithCompactionService( + SubcompactionState* sub_compact) { + assert(sub_compact); + assert(sub_compact->compaction); + assert(db_options_.compaction_service); + + const Compaction* compaction = sub_compact->compaction; + CompactionServiceInput compaction_input; + compaction_input.output_level = compaction->output_level(); + compaction_input.db_id = db_id_; + + const std::vector<CompactionInputFiles>& inputs = + *(compact_->compaction->inputs()); + for (const auto& files_per_level : inputs) { + for (const auto& file : files_per_level.files) { + compaction_input.input_files.emplace_back( + MakeTableFileName(file->fd.GetNumber())); + } + } + compaction_input.column_family.name = + compaction->column_family_data()->GetName(); + compaction_input.column_family.options = + compaction->column_family_data()->GetLatestCFOptions(); + compaction_input.db_options = + BuildDBOptions(db_options_, mutable_db_options_copy_); + compaction_input.snapshots = existing_snapshots_; + compaction_input.has_begin = sub_compact->start.has_value(); + compaction_input.begin = + compaction_input.has_begin ? sub_compact->start->ToString() : ""; + compaction_input.has_end = sub_compact->end.has_value(); + compaction_input.end = + compaction_input.has_end ? sub_compact->end->ToString() : ""; + + std::string compaction_input_binary; + Status s = compaction_input.Write(&compaction_input_binary); + if (!s.ok()) { + sub_compact->status = s; + return CompactionServiceJobStatus::kFailure; + } + + std::ostringstream input_files_oss; + bool is_first_one = true; + for (const auto& file : compaction_input.input_files) { + input_files_oss << (is_first_one ? 
"" : ", ") << file; + is_first_one = false; + } + + ROCKS_LOG_INFO( + db_options_.info_log, + "[%s] [JOB %d] Starting remote compaction (output level: %d): %s", + compaction_input.column_family.name.c_str(), job_id_, + compaction_input.output_level, input_files_oss.str().c_str()); + CompactionServiceJobInfo info(dbname_, db_id_, db_session_id_, + GetCompactionId(sub_compact), thread_pri_); + CompactionServiceJobStatus compaction_status = + db_options_.compaction_service->StartV2(info, compaction_input_binary); + switch (compaction_status) { + case CompactionServiceJobStatus::kSuccess: + break; + case CompactionServiceJobStatus::kFailure: + sub_compact->status = Status::Incomplete( + "CompactionService failed to start compaction job."); + ROCKS_LOG_WARN(db_options_.info_log, + "[%s] [JOB %d] Remote compaction failed to start.", + compaction_input.column_family.name.c_str(), job_id_); + return compaction_status; + case CompactionServiceJobStatus::kUseLocal: + ROCKS_LOG_INFO( + db_options_.info_log, + "[%s] [JOB %d] Remote compaction fallback to local by API Start.", + compaction_input.column_family.name.c_str(), job_id_); + return compaction_status; + default: + assert(false); // unknown status + break; + } + + ROCKS_LOG_INFO(db_options_.info_log, + "[%s] [JOB %d] Waiting for remote compaction...", + compaction_input.column_family.name.c_str(), job_id_); + std::string compaction_result_binary; + compaction_status = db_options_.compaction_service->WaitForCompleteV2( + info, &compaction_result_binary); + + if (compaction_status == CompactionServiceJobStatus::kUseLocal) { + ROCKS_LOG_INFO(db_options_.info_log, + "[%s] [JOB %d] Remote compaction fallback to local by API " + "WaitForComplete.", + compaction_input.column_family.name.c_str(), job_id_); + return compaction_status; + } + + CompactionServiceResult compaction_result; + s = CompactionServiceResult::Read(compaction_result_binary, + &compaction_result); + + if (compaction_status == CompactionServiceJobStatus::kFailure) { + if (s.ok()) { + if (compaction_result.status.ok()) { + sub_compact->status = Status::Incomplete( + "CompactionService failed to run the compaction job (even though " + "the internal status is okay)."); + } else { + // set the current sub compaction status with the status returned from + // remote + sub_compact->status = compaction_result.status; + } + } else { + sub_compact->status = Status::Incomplete( + "CompactionService failed to run the compaction job (and no valid " + "result is returned)."); + compaction_result.status.PermitUncheckedError(); + } + ROCKS_LOG_WARN(db_options_.info_log, + "[%s] [JOB %d] Remote compaction failed.", + compaction_input.column_family.name.c_str(), job_id_); + return compaction_status; + } + + if (!s.ok()) { + sub_compact->status = s; + compaction_result.status.PermitUncheckedError(); + return CompactionServiceJobStatus::kFailure; + } + sub_compact->status = compaction_result.status; + + std::ostringstream output_files_oss; + is_first_one = true; + for (const auto& file : compaction_result.output_files) { + output_files_oss << (is_first_one ? 
"" : ", ") << file.file_name; + is_first_one = false; + } + + ROCKS_LOG_INFO(db_options_.info_log, + "[%s] [JOB %d] Receive remote compaction result, output path: " + "%s, files: %s", + compaction_input.column_family.name.c_str(), job_id_, + compaction_result.output_path.c_str(), + output_files_oss.str().c_str()); + + if (!s.ok()) { + sub_compact->status = s; + return CompactionServiceJobStatus::kFailure; + } + + for (const auto& file : compaction_result.output_files) { + uint64_t file_num = versions_->NewFileNumber(); + auto src_file = compaction_result.output_path + "/" + file.file_name; + auto tgt_file = TableFileName(compaction->immutable_options()->cf_paths, + file_num, compaction->output_path_id()); + s = fs_->RenameFile(src_file, tgt_file, IOOptions(), nullptr); + if (!s.ok()) { + sub_compact->status = s; + return CompactionServiceJobStatus::kFailure; + } + + FileMetaData meta; + uint64_t file_size; + s = fs_->GetFileSize(tgt_file, IOOptions(), &file_size, nullptr); + if (!s.ok()) { + sub_compact->status = s; + return CompactionServiceJobStatus::kFailure; + } + meta.fd = FileDescriptor(file_num, compaction->output_path_id(), file_size, + file.smallest_seqno, file.largest_seqno); + meta.smallest.DecodeFrom(file.smallest_internal_key); + meta.largest.DecodeFrom(file.largest_internal_key); + meta.oldest_ancester_time = file.oldest_ancester_time; + meta.file_creation_time = file.file_creation_time; + meta.marked_for_compaction = file.marked_for_compaction; + meta.unique_id = file.unique_id; + + auto cfd = compaction->column_family_data(); + sub_compact->Current().AddOutput(std::move(meta), + cfd->internal_comparator(), false, false, + true, file.paranoid_hash); + } + sub_compact->compaction_job_stats = compaction_result.stats; + sub_compact->Current().SetNumOutputRecords( + compaction_result.num_output_records); + sub_compact->Current().SetTotalBytes(compaction_result.total_bytes); + RecordTick(stats_, REMOTE_COMPACT_READ_BYTES, compaction_result.bytes_read); + RecordTick(stats_, REMOTE_COMPACT_WRITE_BYTES, + compaction_result.bytes_written); + return CompactionServiceJobStatus::kSuccess; +} + +std::string CompactionServiceCompactionJob::GetTableFileName( + uint64_t file_number) { + return MakeTableFileName(output_path_, file_number); +} + +void CompactionServiceCompactionJob::RecordCompactionIOStats() { + compaction_result_->bytes_read += IOSTATS(bytes_read); + compaction_result_->bytes_written += IOSTATS(bytes_written); + CompactionJob::RecordCompactionIOStats(); +} + +CompactionServiceCompactionJob::CompactionServiceCompactionJob( + int job_id, Compaction* compaction, const ImmutableDBOptions& db_options, + const MutableDBOptions& mutable_db_options, const FileOptions& file_options, + VersionSet* versions, const std::atomic<bool>* shutting_down, + LogBuffer* log_buffer, FSDirectory* output_directory, Statistics* stats, + InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler, + std::vector<SequenceNumber> existing_snapshots, + std::shared_ptr<Cache> table_cache, EventLogger* event_logger, + const std::string& dbname, const std::shared_ptr<IOTracer>& io_tracer, + const std::atomic<bool>& manual_compaction_canceled, + const std::string& db_id, const std::string& db_session_id, + std::string output_path, + const CompactionServiceInput& compaction_service_input, + CompactionServiceResult* compaction_service_result) + : CompactionJob( + job_id, compaction, db_options, mutable_db_options, file_options, + versions, shutting_down, log_buffer, nullptr, output_directory, + nullptr, stats, 
db_mutex, db_error_handler, + std::move(existing_snapshots), kMaxSequenceNumber, nullptr, nullptr, + std::move(table_cache), event_logger, + compaction->mutable_cf_options()->paranoid_file_checks, + compaction->mutable_cf_options()->report_bg_io_stats, dbname, + &(compaction_service_result->stats), Env::Priority::USER, io_tracer, + manual_compaction_canceled, db_id, db_session_id, + compaction->column_family_data()->GetFullHistoryTsLow()), + output_path_(std::move(output_path)), + compaction_input_(compaction_service_input), + compaction_result_(compaction_service_result) {} + +Status CompactionServiceCompactionJob::Run() { + AutoThreadOperationStageUpdater stage_updater( + ThreadStatus::STAGE_COMPACTION_RUN); + + auto* c = compact_->compaction; + assert(c->column_family_data() != nullptr); + assert(c->column_family_data()->current()->storage_info()->NumLevelFiles( + compact_->compaction->level()) > 0); + + write_hint_ = + c->column_family_data()->CalculateSSTWriteHint(c->output_level()); + bottommost_level_ = c->bottommost_level(); + + Slice begin = compaction_input_.begin; + Slice end = compaction_input_.end; + compact_->sub_compact_states.emplace_back( + c, + compaction_input_.has_begin ? std::optional<Slice>(begin) + : std::optional<Slice>(), + compaction_input_.has_end ? std::optional<Slice>(end) + : std::optional<Slice>(), + /*sub_job_id*/ 0); + + log_buffer_->FlushBufferToLog(); + LogCompaction(); + const uint64_t start_micros = db_options_.clock->NowMicros(); + // Pick the only sub-compaction we should have + assert(compact_->sub_compact_states.size() == 1); + SubcompactionState* sub_compact = compact_->sub_compact_states.data(); + + ProcessKeyValueCompaction(sub_compact); + + compaction_stats_.stats.micros = + db_options_.clock->NowMicros() - start_micros; + compaction_stats_.stats.cpu_micros = + sub_compact->compaction_job_stats.cpu_micros; + + RecordTimeToHistogram(stats_, COMPACTION_TIME, + compaction_stats_.stats.micros); + RecordTimeToHistogram(stats_, COMPACTION_CPU_TIME, + compaction_stats_.stats.cpu_micros); + + Status status = sub_compact->status; + IOStatus io_s = sub_compact->io_status; + + if (io_status_.ok()) { + io_status_ = io_s; + } + + if (status.ok()) { + constexpr IODebugContext* dbg = nullptr; + + if (output_directory_) { + io_s = output_directory_->FsyncWithDirOptions(IOOptions(), dbg, + DirFsyncOptions()); + } + } + if (io_status_.ok()) { + io_status_ = io_s; + } + if (status.ok()) { + status = io_s; + } + if (status.ok()) { + // TODO: Add verify_table() + } + + // Finish up all book-keeping to unify the subcompaction results + compact_->AggregateCompactionStats(compaction_stats_, *compaction_job_stats_); + UpdateCompactionStats(); + RecordCompactionIOStats(); + + LogFlush(db_options_.info_log); + compact_->status = status; + compact_->status.PermitUncheckedError(); + + // Build compaction result + compaction_result_->output_level = compact_->compaction->output_level(); + compaction_result_->output_path = output_path_; + for (const auto& output_file : sub_compact->GetOutputs()) { + auto& meta = output_file.meta; + compaction_result_->output_files.emplace_back( + MakeTableFileName(meta.fd.GetNumber()), meta.fd.smallest_seqno, + meta.fd.largest_seqno, meta.smallest.Encode().ToString(), + meta.largest.Encode().ToString(), meta.oldest_ancester_time, + meta.file_creation_time, output_file.validator.GetHash(), + meta.marked_for_compaction, meta.unique_id); + } + InternalStats::CompactionStatsFull compaction_stats; + 
sub_compact->AggregateCompactionStats(compaction_stats); + compaction_result_->num_output_records = + compaction_stats.stats.num_output_records; + compaction_result_->total_bytes = compaction_stats.TotalBytesWritten(); + + return status; +} + +void CompactionServiceCompactionJob::CleanupCompaction() { + CompactionJob::CleanupCompaction(); +} + +// Internal binary format for the input and result data +enum BinaryFormatVersion : uint32_t { + kOptionsString = 1, // Use string format similar to Option string format +}; + +static std::unordered_map<std::string, OptionTypeInfo> cfd_type_info = { + {"name", + {offsetof(struct ColumnFamilyDescriptor, name), OptionType::kEncodedString, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"options", + {offsetof(struct ColumnFamilyDescriptor, options), + OptionType::kConfigurable, OptionVerificationType::kNormal, + OptionTypeFlags::kNone, + [](const ConfigOptions& opts, const std::string& /*name*/, + const std::string& value, void* addr) { + auto cf_options = static_cast<ColumnFamilyOptions*>(addr); + return GetColumnFamilyOptionsFromString(opts, ColumnFamilyOptions(), + value, cf_options); + }, + [](const ConfigOptions& opts, const std::string& /*name*/, + const void* addr, std::string* value) { + const auto cf_options = static_cast<const ColumnFamilyOptions*>(addr); + std::string result; + auto status = + GetStringFromColumnFamilyOptions(opts, *cf_options, &result); + *value = "{" + result + "}"; + return status; + }, + [](const ConfigOptions& opts, const std::string& name, const void* addr1, + const void* addr2, std::string* mismatch) { + const auto this_one = static_cast<const ColumnFamilyOptions*>(addr1); + const auto that_one = static_cast<const ColumnFamilyOptions*>(addr2); + auto this_conf = CFOptionsAsConfigurable(*this_one); + auto that_conf = CFOptionsAsConfigurable(*that_one); + std::string mismatch_opt; + bool result = + this_conf->AreEquivalent(opts, that_conf.get(), &mismatch_opt); + if (!result) { + *mismatch = name + "." + mismatch_opt; + } + return result; + }}}, +}; + +static std::unordered_map<std::string, OptionTypeInfo> cs_input_type_info = { + {"column_family", + OptionTypeInfo::Struct( + "column_family", &cfd_type_info, + offsetof(struct CompactionServiceInput, column_family), + OptionVerificationType::kNormal, OptionTypeFlags::kNone)}, + {"db_options", + {offsetof(struct CompactionServiceInput, db_options), + OptionType::kConfigurable, OptionVerificationType::kNormal, + OptionTypeFlags::kNone, + [](const ConfigOptions& opts, const std::string& /*name*/, + const std::string& value, void* addr) { + auto options = static_cast<DBOptions*>(addr); + return GetDBOptionsFromString(opts, DBOptions(), value, options); + }, + [](const ConfigOptions& opts, const std::string& /*name*/, + const void* addr, std::string* value) { + const auto options = static_cast<const DBOptions*>(addr); + std::string result; + auto status = GetStringFromDBOptions(opts, *options, &result); + *value = "{" + result + "}"; + return status; + }, + [](const ConfigOptions& opts, const std::string& name, const void* addr1, + const void* addr2, std::string* mismatch) { + const auto this_one = static_cast<const DBOptions*>(addr1); + const auto that_one = static_cast<const DBOptions*>(addr2); + auto this_conf = DBOptionsAsConfigurable(*this_one); + auto that_conf = DBOptionsAsConfigurable(*that_one); + std::string mismatch_opt; + bool result = + this_conf->AreEquivalent(opts, that_conf.get(), &mismatch_opt); + if (!result) { + *mismatch = name + "." 
+ mismatch_opt; + } + return result; + }}}, + {"snapshots", OptionTypeInfo::Vector<uint64_t>( + offsetof(struct CompactionServiceInput, snapshots), + OptionVerificationType::kNormal, OptionTypeFlags::kNone, + {0, OptionType::kUInt64T})}, + {"input_files", OptionTypeInfo::Vector<std::string>( + offsetof(struct CompactionServiceInput, input_files), + OptionVerificationType::kNormal, OptionTypeFlags::kNone, + {0, OptionType::kEncodedString})}, + {"output_level", + {offsetof(struct CompactionServiceInput, output_level), OptionType::kInt, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"db_id", + {offsetof(struct CompactionServiceInput, db_id), + OptionType::kEncodedString}}, + {"has_begin", + {offsetof(struct CompactionServiceInput, has_begin), OptionType::kBoolean, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"begin", + {offsetof(struct CompactionServiceInput, begin), + OptionType::kEncodedString, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"has_end", + {offsetof(struct CompactionServiceInput, has_end), OptionType::kBoolean, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"end", + {offsetof(struct CompactionServiceInput, end), OptionType::kEncodedString, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, +}; + +static std::unordered_map<std::string, OptionTypeInfo> + cs_output_file_type_info = { + {"file_name", + {offsetof(struct CompactionServiceOutputFile, file_name), + OptionType::kEncodedString, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"smallest_seqno", + {offsetof(struct CompactionServiceOutputFile, smallest_seqno), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"largest_seqno", + {offsetof(struct CompactionServiceOutputFile, largest_seqno), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"smallest_internal_key", + {offsetof(struct CompactionServiceOutputFile, smallest_internal_key), + OptionType::kEncodedString, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"largest_internal_key", + {offsetof(struct CompactionServiceOutputFile, largest_internal_key), + OptionType::kEncodedString, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"oldest_ancester_time", + {offsetof(struct CompactionServiceOutputFile, oldest_ancester_time), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"file_creation_time", + {offsetof(struct CompactionServiceOutputFile, file_creation_time), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"paranoid_hash", + {offsetof(struct CompactionServiceOutputFile, paranoid_hash), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"marked_for_compaction", + {offsetof(struct CompactionServiceOutputFile, marked_for_compaction), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"unique_id", + OptionTypeInfo::Array<uint64_t, 2>( + offsetof(struct CompactionServiceOutputFile, unique_id), + OptionVerificationType::kNormal, OptionTypeFlags::kNone, + {0, OptionType::kUInt64T})}, +}; + +static std::unordered_map<std::string, OptionTypeInfo> + compaction_job_stats_type_info = { + {"elapsed_micros", + {offsetof(struct CompactionJobStats, elapsed_micros), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"cpu_micros", + {offsetof(struct CompactionJobStats, cpu_micros), OptionType::kUInt64T, + 
OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"num_input_records", + {offsetof(struct CompactionJobStats, num_input_records), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_blobs_read", + {offsetof(struct CompactionJobStats, num_blobs_read), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_input_files", + {offsetof(struct CompactionJobStats, num_input_files), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_input_files_at_output_level", + {offsetof(struct CompactionJobStats, num_input_files_at_output_level), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_output_records", + {offsetof(struct CompactionJobStats, num_output_records), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_output_files", + {offsetof(struct CompactionJobStats, num_output_files), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_output_files_blob", + {offsetof(struct CompactionJobStats, num_output_files_blob), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"is_full_compaction", + {offsetof(struct CompactionJobStats, is_full_compaction), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"is_manual_compaction", + {offsetof(struct CompactionJobStats, is_manual_compaction), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"total_input_bytes", + {offsetof(struct CompactionJobStats, total_input_bytes), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"total_blob_bytes_read", + {offsetof(struct CompactionJobStats, total_blob_bytes_read), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"total_output_bytes", + {offsetof(struct CompactionJobStats, total_output_bytes), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"total_output_bytes_blob", + {offsetof(struct CompactionJobStats, total_output_bytes_blob), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_records_replaced", + {offsetof(struct CompactionJobStats, num_records_replaced), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"total_input_raw_key_bytes", + {offsetof(struct CompactionJobStats, total_input_raw_key_bytes), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"total_input_raw_value_bytes", + {offsetof(struct CompactionJobStats, total_input_raw_value_bytes), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_input_deletion_records", + {offsetof(struct CompactionJobStats, num_input_deletion_records), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_expired_deletion_records", + {offsetof(struct CompactionJobStats, num_expired_deletion_records), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_corrupt_keys", + {offsetof(struct CompactionJobStats, num_corrupt_keys), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"file_write_nanos", + {offsetof(struct CompactionJobStats, file_write_nanos), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + 
{"file_range_sync_nanos", + {offsetof(struct CompactionJobStats, file_range_sync_nanos), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"file_fsync_nanos", + {offsetof(struct CompactionJobStats, file_fsync_nanos), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"file_prepare_write_nanos", + {offsetof(struct CompactionJobStats, file_prepare_write_nanos), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"smallest_output_key_prefix", + {offsetof(struct CompactionJobStats, smallest_output_key_prefix), + OptionType::kEncodedString, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"largest_output_key_prefix", + {offsetof(struct CompactionJobStats, largest_output_key_prefix), + OptionType::kEncodedString, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_single_del_fallthru", + {offsetof(struct CompactionJobStats, num_single_del_fallthru), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_single_del_mismatch", + {offsetof(struct CompactionJobStats, num_single_del_mismatch), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, +}; + +namespace { +// this is a helper struct to serialize and deserialize class Status, because +// Status's members are not public. +struct StatusSerializationAdapter { + uint8_t code; + uint8_t subcode; + uint8_t severity; + std::string message; + + StatusSerializationAdapter() = default; + explicit StatusSerializationAdapter(const Status& s) { + code = s.code(); + subcode = s.subcode(); + severity = s.severity(); + auto msg = s.getState(); + message = msg ? msg : ""; + } + + Status GetStatus() const { + return Status{static_cast<Status::Code>(code), + static_cast<Status::SubCode>(subcode), + static_cast<Status::Severity>(severity), message}; + } +}; +} // namespace + +static std::unordered_map<std::string, OptionTypeInfo> + status_adapter_type_info = { + {"code", + {offsetof(struct StatusSerializationAdapter, code), + OptionType::kUInt8T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"subcode", + {offsetof(struct StatusSerializationAdapter, subcode), + OptionType::kUInt8T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"severity", + {offsetof(struct StatusSerializationAdapter, severity), + OptionType::kUInt8T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"message", + {offsetof(struct StatusSerializationAdapter, message), + OptionType::kEncodedString, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, +}; + +static std::unordered_map<std::string, OptionTypeInfo> cs_result_type_info = { + {"status", + {offsetof(struct CompactionServiceResult, status), + OptionType::kCustomizable, OptionVerificationType::kNormal, + OptionTypeFlags::kNone, + [](const ConfigOptions& opts, const std::string& /*name*/, + const std::string& value, void* addr) { + auto status_obj = static_cast<Status*>(addr); + StatusSerializationAdapter adapter; + Status s = OptionTypeInfo::ParseType( + opts, value, status_adapter_type_info, &adapter); + *status_obj = adapter.GetStatus(); + return s; + }, + [](const ConfigOptions& opts, const std::string& /*name*/, + const void* addr, std::string* value) { + const auto status_obj = static_cast<const Status*>(addr); + StatusSerializationAdapter adapter(*status_obj); + std::string result; + Status s = OptionTypeInfo::SerializeType(opts, status_adapter_type_info, + &adapter, 
&result); + *value = "{" + result + "}"; + return s; + }, + [](const ConfigOptions& opts, const std::string& /*name*/, + const void* addr1, const void* addr2, std::string* mismatch) { + const auto status1 = static_cast<const Status*>(addr1); + const auto status2 = static_cast<const Status*>(addr2); + + StatusSerializationAdapter adatper1(*status1); + StatusSerializationAdapter adapter2(*status2); + return OptionTypeInfo::TypesAreEqual(opts, status_adapter_type_info, + &adatper1, &adapter2, mismatch); + }}}, + {"output_files", + OptionTypeInfo::Vector<CompactionServiceOutputFile>( + offsetof(struct CompactionServiceResult, output_files), + OptionVerificationType::kNormal, OptionTypeFlags::kNone, + OptionTypeInfo::Struct("output_files", &cs_output_file_type_info, 0, + OptionVerificationType::kNormal, + OptionTypeFlags::kNone))}, + {"output_level", + {offsetof(struct CompactionServiceResult, output_level), OptionType::kInt, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"output_path", + {offsetof(struct CompactionServiceResult, output_path), + OptionType::kEncodedString, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_output_records", + {offsetof(struct CompactionServiceResult, num_output_records), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"total_bytes", + {offsetof(struct CompactionServiceResult, total_bytes), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"bytes_read", + {offsetof(struct CompactionServiceResult, bytes_read), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"bytes_written", + {offsetof(struct CompactionServiceResult, bytes_written), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"stats", OptionTypeInfo::Struct( + "stats", &compaction_job_stats_type_info, + offsetof(struct CompactionServiceResult, stats), + OptionVerificationType::kNormal, OptionTypeFlags::kNone)}, +}; + +Status CompactionServiceInput::Read(const std::string& data_str, + CompactionServiceInput* obj) { + if (data_str.size() <= sizeof(BinaryFormatVersion)) { + return Status::InvalidArgument("Invalid CompactionServiceInput string"); + } + auto format_version = DecodeFixed32(data_str.data()); + if (format_version == kOptionsString) { + ConfigOptions cf; + cf.invoke_prepare_options = false; + cf.ignore_unknown_options = true; + return OptionTypeInfo::ParseType( + cf, data_str.substr(sizeof(BinaryFormatVersion)), cs_input_type_info, + obj); + } else { + return Status::NotSupported( + "Compaction Service Input data version not supported: " + + std::to_string(format_version)); + } +} + +Status CompactionServiceInput::Write(std::string* output) { + char buf[sizeof(BinaryFormatVersion)]; + EncodeFixed32(buf, kOptionsString); + output->append(buf, sizeof(BinaryFormatVersion)); + ConfigOptions cf; + cf.invoke_prepare_options = false; + return OptionTypeInfo::SerializeType(cf, cs_input_type_info, this, output); +} + +Status CompactionServiceResult::Read(const std::string& data_str, + CompactionServiceResult* obj) { + if (data_str.size() <= sizeof(BinaryFormatVersion)) { + return Status::InvalidArgument("Invalid CompactionServiceResult string"); + } + auto format_version = DecodeFixed32(data_str.data()); + if (format_version == kOptionsString) { + ConfigOptions cf; + cf.invoke_prepare_options = false; + cf.ignore_unknown_options = true; + return OptionTypeInfo::ParseType( + cf, 
data_str.substr(sizeof(BinaryFormatVersion)), cs_result_type_info, + obj); + } else { + return Status::NotSupported( + "Compaction Service Result data version not supported: " + + std::to_string(format_version)); + } +} + +Status CompactionServiceResult::Write(std::string* output) { + char buf[sizeof(BinaryFormatVersion)]; + EncodeFixed32(buf, kOptionsString); + output->append(buf, sizeof(BinaryFormatVersion)); + ConfigOptions cf; + cf.invoke_prepare_options = false; + return OptionTypeInfo::SerializeType(cf, cs_result_type_info, this, output); +} + +#ifndef NDEBUG +bool CompactionServiceResult::TEST_Equals(CompactionServiceResult* other) { + std::string mismatch; + return TEST_Equals(other, &mismatch); +} + +bool CompactionServiceResult::TEST_Equals(CompactionServiceResult* other, + std::string* mismatch) { + ConfigOptions cf; + cf.invoke_prepare_options = false; + return OptionTypeInfo::TypesAreEqual(cf, cs_result_type_info, this, other, + mismatch); +} + +bool CompactionServiceInput::TEST_Equals(CompactionServiceInput* other) { + std::string mismatch; + return TEST_Equals(other, &mismatch); +} + +bool CompactionServiceInput::TEST_Equals(CompactionServiceInput* other, + std::string* mismatch) { + ConfigOptions cf; + cf.invoke_prepare_options = false; + return OptionTypeInfo::TypesAreEqual(cf, cs_input_type_info, this, other, + mismatch); +} +#endif // NDEBUG +} // namespace ROCKSDB_NAMESPACE + +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/db/compaction/compaction_service_test.cc b/src/rocksdb/db/compaction/compaction_service_test.cc new file mode 100644 index 000000000..c475c4e3b --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_service_test.cc @@ -0,0 +1,966 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
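The Read/Write methods above frame both CompactionServiceInput and CompactionServiceResult the same way: a fixed 4-byte format version (kOptionsString == 1, written little-endian by EncodeFixed32) followed by an options-string style payload, and Read rejects anything shorter than the version prefix or with an unknown version. A standalone sketch of just that framing; FrameOptionsStringPayload is an illustrative helper, not a RocksDB function.

    #include <cstdint>
    #include <string>

    // Prepend the 4-byte little-endian format version to an already-serialized
    // options-string payload, mirroring what Write() does with EncodeFixed32.
    std::string FrameOptionsStringPayload(const std::string& payload) {
      const uint32_t version = 1;  // kOptionsString
      std::string out;
      out.push_back(static_cast<char>(version & 0xff));
      out.push_back(static_cast<char>((version >> 8) & 0xff));
      out.push_back(static_cast<char>((version >> 16) & 0xff));
      out.push_back(static_cast<char>((version >> 24) & 0xff));
      out.append(payload);
      return out;
    }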
+ +#ifndef ROCKSDB_LITE + +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "table/unique_id_impl.h" + +namespace ROCKSDB_NAMESPACE { + +class MyTestCompactionService : public CompactionService { + public: + MyTestCompactionService( + std::string db_path, Options& options, + std::shared_ptr<Statistics>& statistics, + std::vector<std::shared_ptr<EventListener>>& listeners, + std::vector<std::shared_ptr<TablePropertiesCollectorFactory>> + table_properties_collector_factories) + : db_path_(std::move(db_path)), + options_(options), + statistics_(statistics), + start_info_("na", "na", "na", 0, Env::TOTAL), + wait_info_("na", "na", "na", 0, Env::TOTAL), + listeners_(listeners), + table_properties_collector_factories_( + std::move(table_properties_collector_factories)) {} + + static const char* kClassName() { return "MyTestCompactionService"; } + + const char* Name() const override { return kClassName(); } + + CompactionServiceJobStatus StartV2( + const CompactionServiceJobInfo& info, + const std::string& compaction_service_input) override { + InstrumentedMutexLock l(&mutex_); + start_info_ = info; + assert(info.db_name == db_path_); + jobs_.emplace(info.job_id, compaction_service_input); + CompactionServiceJobStatus s = CompactionServiceJobStatus::kSuccess; + if (is_override_start_status_) { + return override_start_status_; + } + return s; + } + + CompactionServiceJobStatus WaitForCompleteV2( + const CompactionServiceJobInfo& info, + std::string* compaction_service_result) override { + std::string compaction_input; + assert(info.db_name == db_path_); + { + InstrumentedMutexLock l(&mutex_); + wait_info_ = info; + auto i = jobs_.find(info.job_id); + if (i == jobs_.end()) { + return CompactionServiceJobStatus::kFailure; + } + compaction_input = std::move(i->second); + jobs_.erase(i); + } + + if (is_override_wait_status_) { + return override_wait_status_; + } + + CompactionServiceOptionsOverride options_override; + options_override.env = options_.env; + options_override.file_checksum_gen_factory = + options_.file_checksum_gen_factory; + options_override.comparator = options_.comparator; + options_override.merge_operator = options_.merge_operator; + options_override.compaction_filter = options_.compaction_filter; + options_override.compaction_filter_factory = + options_.compaction_filter_factory; + options_override.prefix_extractor = options_.prefix_extractor; + options_override.table_factory = options_.table_factory; + options_override.sst_partitioner_factory = options_.sst_partitioner_factory; + options_override.statistics = statistics_; + if (!listeners_.empty()) { + options_override.listeners = listeners_; + } + + if (!table_properties_collector_factories_.empty()) { + options_override.table_properties_collector_factories = + table_properties_collector_factories_; + } + + OpenAndCompactOptions options; + options.canceled = &canceled_; + + Status s = DB::OpenAndCompact( + options, db_path_, db_path_ + "/" + std::to_string(info.job_id), + compaction_input, compaction_service_result, options_override); + if (is_override_wait_result_) { + *compaction_service_result = override_wait_result_; + } + compaction_num_.fetch_add(1); + if (s.ok()) { + return CompactionServiceJobStatus::kSuccess; + } else { + return CompactionServiceJobStatus::kFailure; + } + } + + int GetCompactionNum() { return compaction_num_.load(); } + + CompactionServiceJobInfo GetCompactionInfoForStart() { return start_info_; } + CompactionServiceJobInfo GetCompactionInfoForWait() { return wait_info_; } + + void 
OverrideStartStatus(CompactionServiceJobStatus s) { + is_override_start_status_ = true; + override_start_status_ = s; + } + + void OverrideWaitStatus(CompactionServiceJobStatus s) { + is_override_wait_status_ = true; + override_wait_status_ = s; + } + + void OverrideWaitResult(std::string str) { + is_override_wait_result_ = true; + override_wait_result_ = std::move(str); + } + + void ResetOverride() { + is_override_wait_result_ = false; + is_override_start_status_ = false; + is_override_wait_status_ = false; + } + + void SetCanceled(bool canceled) { canceled_ = canceled; } + + private: + InstrumentedMutex mutex_; + std::atomic_int compaction_num_{0}; + std::map<uint64_t, std::string> jobs_; + const std::string db_path_; + Options options_; + std::shared_ptr<Statistics> statistics_; + CompactionServiceJobInfo start_info_; + CompactionServiceJobInfo wait_info_; + bool is_override_start_status_ = false; + CompactionServiceJobStatus override_start_status_ = + CompactionServiceJobStatus::kFailure; + bool is_override_wait_status_ = false; + CompactionServiceJobStatus override_wait_status_ = + CompactionServiceJobStatus::kFailure; + bool is_override_wait_result_ = false; + std::string override_wait_result_; + std::vector<std::shared_ptr<EventListener>> listeners_; + std::vector<std::shared_ptr<TablePropertiesCollectorFactory>> + table_properties_collector_factories_; + std::atomic_bool canceled_{false}; +}; + +class CompactionServiceTest : public DBTestBase { + public: + explicit CompactionServiceTest() + : DBTestBase("compaction_service_test", true) {} + + protected: + void ReopenWithCompactionService(Options* options) { + options->env = env_; + primary_statistics_ = CreateDBStatistics(); + options->statistics = primary_statistics_; + compactor_statistics_ = CreateDBStatistics(); + + compaction_service_ = std::make_shared<MyTestCompactionService>( + dbname_, *options, compactor_statistics_, remote_listeners, + remote_table_properties_collector_factories); + options->compaction_service = compaction_service_; + DestroyAndReopen(*options); + } + + Statistics* GetCompactorStatistics() { return compactor_statistics_.get(); } + + Statistics* GetPrimaryStatistics() { return primary_statistics_.get(); } + + MyTestCompactionService* GetCompactionService() { + CompactionService* cs = compaction_service_.get(); + return static_cast_with_check<MyTestCompactionService>(cs); + } + + void GenerateTestData() { + // Generate 20 files @ L2 + for (int i = 0; i < 20; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 10 + j; + ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id))); + } + ASSERT_OK(Flush()); + } + MoveFilesToLevel(2); + + // Generate 10 files @ L1 overlap with all 20 files @ L2 + for (int i = 0; i < 10; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 20 + j * 2; + ASSERT_OK(Put(Key(key_id), "value_new" + std::to_string(key_id))); + } + ASSERT_OK(Flush()); + } + MoveFilesToLevel(1); + ASSERT_EQ(FilesPerLevel(), "0,10,20"); + } + + void VerifyTestData() { + for (int i = 0; i < 200; i++) { + auto result = Get(Key(i)); + if (i % 2) { + ASSERT_EQ(result, "value" + std::to_string(i)); + } else { + ASSERT_EQ(result, "value_new" + std::to_string(i)); + } + } + } + + std::vector<std::shared_ptr<EventListener>> remote_listeners; + std::vector<std::shared_ptr<TablePropertiesCollectorFactory>> + remote_table_properties_collector_factories; + + private: + std::shared_ptr<Statistics> compactor_statistics_; + std::shared_ptr<Statistics> primary_statistics_; + 
std::shared_ptr<CompactionService> compaction_service_; +}; + +TEST_F(CompactionServiceTest, BasicCompactions) { + Options options = CurrentOptions(); + ReopenWithCompactionService(&options); + + Statistics* primary_statistics = GetPrimaryStatistics(); + Statistics* compactor_statistics = GetCompactorStatistics(); + + for (int i = 0; i < 20; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 10 + j; + ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id))); + } + ASSERT_OK(Flush()); + } + + for (int i = 0; i < 10; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 20 + j * 2; + ASSERT_OK(Put(Key(key_id), "value_new" + std::to_string(key_id))); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + // verify result + for (int i = 0; i < 200; i++) { + auto result = Get(Key(i)); + if (i % 2) { + ASSERT_EQ(result, "value" + std::to_string(i)); + } else { + ASSERT_EQ(result, "value_new" + std::to_string(i)); + } + } + auto my_cs = GetCompactionService(); + ASSERT_GE(my_cs->GetCompactionNum(), 1); + + // make sure the compaction statistics is only recorded on the remote side + ASSERT_GE(compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES), 1); + ASSERT_GE(compactor_statistics->getTickerCount(COMPACT_READ_BYTES), 1); + ASSERT_EQ(primary_statistics->getTickerCount(COMPACT_WRITE_BYTES), 0); + // even with remote compaction, primary host still needs to read SST files to + // `verify_table()`. + ASSERT_GE(primary_statistics->getTickerCount(COMPACT_READ_BYTES), 1); + // all the compaction write happens on the remote side + ASSERT_EQ(primary_statistics->getTickerCount(REMOTE_COMPACT_WRITE_BYTES), + compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES)); + ASSERT_GE(primary_statistics->getTickerCount(REMOTE_COMPACT_READ_BYTES), 1); + ASSERT_GT(primary_statistics->getTickerCount(COMPACT_READ_BYTES), + primary_statistics->getTickerCount(REMOTE_COMPACT_READ_BYTES)); + // compactor is already the remote side, which doesn't have remote + ASSERT_EQ(compactor_statistics->getTickerCount(REMOTE_COMPACT_READ_BYTES), 0); + ASSERT_EQ(compactor_statistics->getTickerCount(REMOTE_COMPACT_WRITE_BYTES), + 0); + + // Test failed compaction + SyncPoint::GetInstance()->SetCallBack( + "DBImplSecondary::CompactWithoutInstallation::End", [&](void* status) { + // override job status + auto s = static_cast<Status*>(status); + *s = Status::Aborted("MyTestCompactionService failed to compact!"); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + Status s; + for (int i = 0; i < 10; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 20 + j * 2; + s = Put(Key(key_id), "value_new" + std::to_string(key_id)); + if (s.IsAborted()) { + break; + } + } + if (s.IsAborted()) { + break; + } + s = Flush(); + if (s.IsAborted()) { + break; + } + s = dbfull()->TEST_WaitForCompact(); + if (s.IsAborted()) { + break; + } + } + ASSERT_TRUE(s.IsAborted()); + + // Test re-open and successful unique id verification + std::atomic_int verify_passed{0}; + SyncPoint::GetInstance()->SetCallBack( + "BlockBasedTable::Open::PassedVerifyUniqueId", [&](void* arg) { + // override job status + auto id = static_cast<UniqueId64x2*>(arg); + assert(*id != kNullUniqueId64x2); + verify_passed++; + }); + Reopen(options); + ASSERT_GT(verify_passed, 0); + Close(); +} + +TEST_F(CompactionServiceTest, ManualCompaction) { + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + ReopenWithCompactionService(&options); + GenerateTestData(); + + auto my_cs = GetCompactionService(); + + 
std::string start_str = Key(15); + std::string end_str = Key(45); + Slice start(start_str); + Slice end(end_str); + uint64_t comp_num = my_cs->GetCompactionNum(); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &end)); + ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1); + VerifyTestData(); + + start_str = Key(120); + start = start_str; + comp_num = my_cs->GetCompactionNum(); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, nullptr)); + ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1); + VerifyTestData(); + + end_str = Key(92); + end = end_str; + comp_num = my_cs->GetCompactionNum(); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, &end)); + ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1); + VerifyTestData(); + + comp_num = my_cs->GetCompactionNum(); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1); + VerifyTestData(); +} + +TEST_F(CompactionServiceTest, CancelCompactionOnRemoteSide) { + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + ReopenWithCompactionService(&options); + GenerateTestData(); + + auto my_cs = GetCompactionService(); + + std::string start_str = Key(15); + std::string end_str = Key(45); + Slice start(start_str); + Slice end(end_str); + uint64_t comp_num = my_cs->GetCompactionNum(); + + // Test cancel compaction at the beginning + my_cs->SetCanceled(true); + auto s = db_->CompactRange(CompactRangeOptions(), &start, &end); + ASSERT_TRUE(s.IsIncomplete()); + // compaction number is not increased + ASSERT_GE(my_cs->GetCompactionNum(), comp_num); + VerifyTestData(); + + // Test cancel compaction in progress + ReopenWithCompactionService(&options); + GenerateTestData(); + my_cs = GetCompactionService(); + my_cs->SetCanceled(false); + + std::atomic_bool cancel_issued{false}; + SyncPoint::GetInstance()->SetCallBack("CompactionJob::Run():Inprogress", + [&](void* /*arg*/) { + cancel_issued = true; + my_cs->SetCanceled(true); + }); + + SyncPoint::GetInstance()->EnableProcessing(); + + s = db_->CompactRange(CompactRangeOptions(), &start, &end); + ASSERT_TRUE(s.IsIncomplete()); + ASSERT_TRUE(cancel_issued); + // compaction number is not increased + ASSERT_GE(my_cs->GetCompactionNum(), comp_num); + VerifyTestData(); +} + +TEST_F(CompactionServiceTest, FailedToStart) { + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + ReopenWithCompactionService(&options); + + GenerateTestData(); + + auto my_cs = GetCompactionService(); + my_cs->OverrideStartStatus(CompactionServiceJobStatus::kFailure); + + std::string start_str = Key(15); + std::string end_str = Key(45); + Slice start(start_str); + Slice end(end_str); + Status s = db_->CompactRange(CompactRangeOptions(), &start, &end); + ASSERT_TRUE(s.IsIncomplete()); +} + +TEST_F(CompactionServiceTest, InvalidResult) { + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + ReopenWithCompactionService(&options); + + GenerateTestData(); + + auto my_cs = GetCompactionService(); + my_cs->OverrideWaitResult("Invalid Str"); + + std::string start_str = Key(15); + std::string end_str = Key(45); + Slice start(start_str); + Slice end(end_str); + Status s = db_->CompactRange(CompactRangeOptions(), &start, &end); + ASSERT_FALSE(s.ok()); +} + +TEST_F(CompactionServiceTest, SubCompaction) { + Options options = CurrentOptions(); + options.max_subcompactions = 10; + options.target_file_size_base = 1 << 10; // 1KB + options.disable_auto_compactions = true; + 
ReopenWithCompactionService(&options); + + GenerateTestData(); + VerifyTestData(); + + auto my_cs = GetCompactionService(); + int compaction_num_before = my_cs->GetCompactionNum(); + + auto cro = CompactRangeOptions(); + cro.max_subcompactions = 10; + Status s = db_->CompactRange(cro, nullptr, nullptr); + ASSERT_OK(s); + VerifyTestData(); + int compaction_num = my_cs->GetCompactionNum() - compaction_num_before; + // make sure there's sub-compaction by checking the compaction number + ASSERT_GE(compaction_num, 2); +} + +class PartialDeleteCompactionFilter : public CompactionFilter { + public: + CompactionFilter::Decision FilterV2( + int /*level*/, const Slice& key, ValueType /*value_type*/, + const Slice& /*existing_value*/, std::string* /*new_value*/, + std::string* /*skip_until*/) const override { + int i = std::stoi(key.ToString().substr(3)); + if (i > 5 && i <= 105) { + return CompactionFilter::Decision::kRemove; + } + return CompactionFilter::Decision::kKeep; + } + + const char* Name() const override { return "PartialDeleteCompactionFilter"; } +}; + +TEST_F(CompactionServiceTest, CompactionFilter) { + Options options = CurrentOptions(); + std::unique_ptr<CompactionFilter> delete_comp_filter( + new PartialDeleteCompactionFilter()); + options.compaction_filter = delete_comp_filter.get(); + ReopenWithCompactionService(&options); + + for (int i = 0; i < 20; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 10 + j; + ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id))); + } + ASSERT_OK(Flush()); + } + + for (int i = 0; i < 10; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 20 + j * 2; + ASSERT_OK(Put(Key(key_id), "value_new" + std::to_string(key_id))); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + // verify result + for (int i = 0; i < 200; i++) { + auto result = Get(Key(i)); + if (i > 5 && i <= 105) { + ASSERT_EQ(result, "NOT_FOUND"); + } else if (i % 2) { + ASSERT_EQ(result, "value" + std::to_string(i)); + } else { + ASSERT_EQ(result, "value_new" + std::to_string(i)); + } + } + auto my_cs = GetCompactionService(); + ASSERT_GE(my_cs->GetCompactionNum(), 1); +} + +TEST_F(CompactionServiceTest, Snapshot) { + Options options = CurrentOptions(); + ReopenWithCompactionService(&options); + + ASSERT_OK(Put(Key(1), "value1")); + ASSERT_OK(Put(Key(2), "value1")); + const Snapshot* s1 = db_->GetSnapshot(); + ASSERT_OK(Flush()); + + ASSERT_OK(Put(Key(1), "value2")); + ASSERT_OK(Put(Key(3), "value2")); + ASSERT_OK(Flush()); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + auto my_cs = GetCompactionService(); + ASSERT_GE(my_cs->GetCompactionNum(), 1); + ASSERT_EQ("value1", Get(Key(1), s1)); + ASSERT_EQ("value2", Get(Key(1))); + db_->ReleaseSnapshot(s1); +} + +TEST_F(CompactionServiceTest, ConcurrentCompaction) { + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 100; + options.max_background_jobs = 20; + ReopenWithCompactionService(&options); + GenerateTestData(); + + ColumnFamilyMetaData meta; + db_->GetColumnFamilyMetaData(&meta); + + std::vector<std::thread> threads; + for (const auto& file : meta.levels[1].files) { + threads.emplace_back(std::thread([&]() { + std::string fname = file.db_path + "/" + file.name; + ASSERT_OK(db_->CompactFiles(CompactionOptions(), {fname}, 2)); + })); + } + + for (auto& thread : threads) { + thread.join(); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + // verify 
result + for (int i = 0; i < 200; i++) { + auto result = Get(Key(i)); + if (i % 2) { + ASSERT_EQ(result, "value" + std::to_string(i)); + } else { + ASSERT_EQ(result, "value_new" + std::to_string(i)); + } + } + auto my_cs = GetCompactionService(); + ASSERT_EQ(my_cs->GetCompactionNum(), 10); + ASSERT_EQ(FilesPerLevel(), "0,0,10"); +} + +TEST_F(CompactionServiceTest, CompactionInfo) { + Options options = CurrentOptions(); + ReopenWithCompactionService(&options); + + for (int i = 0; i < 20; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 10 + j; + ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id))); + } + ASSERT_OK(Flush()); + } + + for (int i = 0; i < 10; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 20 + j * 2; + ASSERT_OK(Put(Key(key_id), "value_new" + std::to_string(key_id))); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + auto my_cs = + static_cast_with_check<MyTestCompactionService>(GetCompactionService()); + uint64_t comp_num = my_cs->GetCompactionNum(); + ASSERT_GE(comp_num, 1); + + CompactionServiceJobInfo info = my_cs->GetCompactionInfoForStart(); + ASSERT_EQ(dbname_, info.db_name); + std::string db_id, db_session_id; + ASSERT_OK(db_->GetDbIdentity(db_id)); + ASSERT_EQ(db_id, info.db_id); + ASSERT_OK(db_->GetDbSessionId(db_session_id)); + ASSERT_EQ(db_session_id, info.db_session_id); + ASSERT_EQ(Env::LOW, info.priority); + info = my_cs->GetCompactionInfoForWait(); + ASSERT_EQ(dbname_, info.db_name); + ASSERT_EQ(db_id, info.db_id); + ASSERT_EQ(db_session_id, info.db_session_id); + ASSERT_EQ(Env::LOW, info.priority); + + // Test priority USER + ColumnFamilyMetaData meta; + db_->GetColumnFamilyMetaData(&meta); + SstFileMetaData file = meta.levels[1].files[0]; + ASSERT_OK(db_->CompactFiles(CompactionOptions(), + {file.db_path + "/" + file.name}, 2)); + info = my_cs->GetCompactionInfoForStart(); + ASSERT_EQ(Env::USER, info.priority); + info = my_cs->GetCompactionInfoForWait(); + ASSERT_EQ(Env::USER, info.priority); + + // Test priority BOTTOM + env_->SetBackgroundThreads(1, Env::BOTTOM); + options.num_levels = 2; + ReopenWithCompactionService(&options); + my_cs = + static_cast_with_check<MyTestCompactionService>(GetCompactionService()); + + for (int i = 0; i < 20; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 10 + j; + ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id))); + } + ASSERT_OK(Flush()); + } + + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 20 + j * 2; + ASSERT_OK(Put(Key(key_id), "value_new" + std::to_string(key_id))); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + info = my_cs->GetCompactionInfoForStart(); + ASSERT_EQ(Env::BOTTOM, info.priority); + info = my_cs->GetCompactionInfoForWait(); + ASSERT_EQ(Env::BOTTOM, info.priority); +} + +TEST_F(CompactionServiceTest, FallbackLocalAuto) { + Options options = CurrentOptions(); + ReopenWithCompactionService(&options); + + auto my_cs = GetCompactionService(); + Statistics* compactor_statistics = GetCompactorStatistics(); + Statistics* primary_statistics = GetPrimaryStatistics(); + uint64_t compactor_write_bytes = + compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES); + uint64_t primary_write_bytes = + primary_statistics->getTickerCount(COMPACT_WRITE_BYTES); + + my_cs->OverrideStartStatus(CompactionServiceJobStatus::kUseLocal); + + for (int i = 0; i < 20; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 10 + j; + ASSERT_OK(Put(Key(key_id), "value" + 
std::to_string(key_id))); + } + ASSERT_OK(Flush()); + } + + for (int i = 0; i < 10; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 20 + j * 2; + ASSERT_OK(Put(Key(key_id), "value_new" + std::to_string(key_id))); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + // verify result + for (int i = 0; i < 200; i++) { + auto result = Get(Key(i)); + if (i % 2) { + ASSERT_EQ(result, "value" + std::to_string(i)); + } else { + ASSERT_EQ(result, "value_new" + std::to_string(i)); + } + } + + ASSERT_EQ(my_cs->GetCompactionNum(), 0); + + // make sure the compaction statistics is only recorded on the local side + ASSERT_EQ(compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES), + compactor_write_bytes); + ASSERT_GT(primary_statistics->getTickerCount(COMPACT_WRITE_BYTES), + primary_write_bytes); + ASSERT_EQ(primary_statistics->getTickerCount(REMOTE_COMPACT_READ_BYTES), 0); + ASSERT_EQ(primary_statistics->getTickerCount(REMOTE_COMPACT_WRITE_BYTES), 0); +} + +TEST_F(CompactionServiceTest, FallbackLocalManual) { + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + ReopenWithCompactionService(&options); + + GenerateTestData(); + VerifyTestData(); + + auto my_cs = GetCompactionService(); + Statistics* compactor_statistics = GetCompactorStatistics(); + Statistics* primary_statistics = GetPrimaryStatistics(); + uint64_t compactor_write_bytes = + compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES); + uint64_t primary_write_bytes = + primary_statistics->getTickerCount(COMPACT_WRITE_BYTES); + + // re-enable remote compaction + my_cs->ResetOverride(); + std::string start_str = Key(15); + std::string end_str = Key(45); + Slice start(start_str); + Slice end(end_str); + uint64_t comp_num = my_cs->GetCompactionNum(); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &end)); + ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1); + // make sure the compaction statistics is only recorded on the remote side + ASSERT_GT(compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES), + compactor_write_bytes); + ASSERT_EQ(primary_statistics->getTickerCount(REMOTE_COMPACT_WRITE_BYTES), + compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES)); + ASSERT_EQ(primary_statistics->getTickerCount(COMPACT_WRITE_BYTES), + primary_write_bytes); + + // return run local again with API WaitForComplete + my_cs->OverrideWaitStatus(CompactionServiceJobStatus::kUseLocal); + start_str = Key(120); + start = start_str; + comp_num = my_cs->GetCompactionNum(); + compactor_write_bytes = + compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES); + primary_write_bytes = primary_statistics->getTickerCount(COMPACT_WRITE_BYTES); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, nullptr)); + ASSERT_EQ(my_cs->GetCompactionNum(), + comp_num); // no remote compaction is run + // make sure the compaction statistics is only recorded on the local side + ASSERT_EQ(compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES), + compactor_write_bytes); + ASSERT_GT(primary_statistics->getTickerCount(COMPACT_WRITE_BYTES), + primary_write_bytes); + ASSERT_EQ(primary_statistics->getTickerCount(REMOTE_COMPACT_WRITE_BYTES), + compactor_write_bytes); + + // verify result after 2 manual compactions + VerifyTestData(); +} + +TEST_F(CompactionServiceTest, RemoteEventListener) { + class RemoteEventListenerTest : public EventListener { + public: + const char* Name() const override { return "RemoteEventListenerTest"; } + + void OnSubcompactionBegin(const SubcompactionJobInfo& 
info) override { + auto result = on_going_compactions.emplace(info.job_id); + ASSERT_TRUE(result.second); // make sure there's no duplication + compaction_num++; + EventListener::OnSubcompactionBegin(info); + } + void OnSubcompactionCompleted(const SubcompactionJobInfo& info) override { + auto num = on_going_compactions.erase(info.job_id); + ASSERT_TRUE(num == 1); // make sure the compaction id exists + EventListener::OnSubcompactionCompleted(info); + } + void OnTableFileCreated(const TableFileCreationInfo& info) override { + ASSERT_EQ(on_going_compactions.count(info.job_id), 1); + file_created++; + EventListener::OnTableFileCreated(info); + } + void OnTableFileCreationStarted( + const TableFileCreationBriefInfo& info) override { + ASSERT_EQ(on_going_compactions.count(info.job_id), 1); + file_creation_started++; + EventListener::OnTableFileCreationStarted(info); + } + + bool ShouldBeNotifiedOnFileIO() override { + file_io_notified++; + return EventListener::ShouldBeNotifiedOnFileIO(); + } + + std::atomic_uint64_t file_io_notified{0}; + std::atomic_uint64_t file_creation_started{0}; + std::atomic_uint64_t file_created{0}; + + std::set<int> on_going_compactions; // store the job_id + std::atomic_uint64_t compaction_num{0}; + }; + + auto listener = new RemoteEventListenerTest(); + remote_listeners.emplace_back(listener); + + Options options = CurrentOptions(); + ReopenWithCompactionService(&options); + + for (int i = 0; i < 20; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 10 + j; + ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id))); + } + ASSERT_OK(Flush()); + } + + for (int i = 0; i < 10; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 20 + j * 2; + ASSERT_OK(Put(Key(key_id), "value_new" + std::to_string(key_id))); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + // check the events are triggered + ASSERT_TRUE(listener->file_io_notified > 0); + ASSERT_TRUE(listener->file_creation_started > 0); + ASSERT_TRUE(listener->file_created > 0); + ASSERT_TRUE(listener->compaction_num > 0); + ASSERT_TRUE(listener->on_going_compactions.empty()); + + // verify result + for (int i = 0; i < 200; i++) { + auto result = Get(Key(i)); + if (i % 2) { + ASSERT_EQ(result, "value" + std::to_string(i)); + } else { + ASSERT_EQ(result, "value_new" + std::to_string(i)); + } + } +} + +TEST_F(CompactionServiceTest, TablePropertiesCollector) { + const static std::string kUserPropertyName = "TestCount"; + + class TablePropertiesCollectorTest : public TablePropertiesCollector { + public: + Status Finish(UserCollectedProperties* properties) override { + *properties = UserCollectedProperties{ + {kUserPropertyName, std::to_string(count_)}, + }; + return Status::OK(); + } + + UserCollectedProperties GetReadableProperties() const override { + return UserCollectedProperties(); + } + + const char* Name() const override { return "TablePropertiesCollectorTest"; } + + Status AddUserKey(const Slice& /*user_key*/, const Slice& /*value*/, + EntryType /*type*/, SequenceNumber /*seq*/, + uint64_t /*file_size*/) override { + count_++; + return Status::OK(); + } + + private: + uint32_t count_ = 0; + }; + + class TablePropertiesCollectorFactoryTest + : public TablePropertiesCollectorFactory { + public: + TablePropertiesCollector* CreateTablePropertiesCollector( + TablePropertiesCollectorFactory::Context /*context*/) override { + return new TablePropertiesCollectorTest(); + } + + const char* Name() const override { + return "TablePropertiesCollectorFactoryTest"; + } + }; + + 
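// The factory below is registered only with the remote compaction service (via remote_table_properties_collector_factories), so SSTs flushed locally are not expected to carry the custom property. +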
auto factory = new TablePropertiesCollectorFactoryTest(); + remote_table_properties_collector_factories.emplace_back(factory); + + const int kNumSst = 3; + const int kLevel0Trigger = 4; + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = kLevel0Trigger; + ReopenWithCompactionService(&options); + + // generate a few SSTs locally which should not have user property + for (int i = 0; i < kNumSst; i++) { + for (int j = 0; j < 100; j++) { + ASSERT_OK(Put(Key(i * 10 + j), "value")); + } + ASSERT_OK(Flush()); + } + + TablePropertiesCollection fname_to_props; + ASSERT_OK(db_->GetPropertiesOfAllTables(&fname_to_props)); + for (const auto& file_props : fname_to_props) { + auto properties = file_props.second->user_collected_properties; + auto it = properties.find(kUserPropertyName); + ASSERT_EQ(it, properties.end()); + } + + // trigger compaction + for (int i = kNumSst; i < kLevel0Trigger; i++) { + for (int j = 0; j < 100; j++) { + ASSERT_OK(Put(Key(i * 10 + j), "value")); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); + + ASSERT_OK(db_->GetPropertiesOfAllTables(&fname_to_props)); + + bool has_user_property = false; + for (const auto& file_props : fname_to_props) { + auto properties = file_props.second->user_collected_properties; + auto it = properties.find(kUserPropertyName); + if (it != properties.end()) { + has_user_property = true; + ASSERT_GT(std::stoi(it->second), 0); + } + } + ASSERT_TRUE(has_user_property); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); + return RUN_ALL_TESTS(); +} + +#else +#include <stdio.h> + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, + "SKIPPED as CompactionService is not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/db/compaction/compaction_state.cc b/src/rocksdb/db/compaction/compaction_state.cc new file mode 100644 index 000000000..ee4b0c189 --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_state.cc @@ -0,0 +1,46 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/compaction/compaction_state.h" + +namespace ROCKSDB_NAMESPACE { + +Slice CompactionState::SmallestUserKey() { + for (const auto& sub_compact_state : sub_compact_states) { + Slice smallest = sub_compact_state.SmallestUserKey(); + if (!smallest.empty()) { + return smallest; + } + } + // If there is no finished output, return an empty slice. + return Slice{nullptr, 0}; +} + +Slice CompactionState::LargestUserKey() { + for (auto it = sub_compact_states.rbegin(); it < sub_compact_states.rend(); + ++it) { + Slice largest = it->LargestUserKey(); + if (!largest.empty()) { + return largest; + } + } + // If there is no finished output, return an empty slice. 
+ return Slice{nullptr, 0}; +} + +void CompactionState::AggregateCompactionStats( + InternalStats::CompactionStatsFull& compaction_stats, + CompactionJobStats& compaction_job_stats) { + for (const auto& sc : sub_compact_states) { + sc.AggregateCompactionStats(compaction_stats); + compaction_job_stats.Add(sc.compaction_job_stats); + } +} +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/compaction/compaction_state.h b/src/rocksdb/db/compaction/compaction_state.h new file mode 100644 index 000000000..cc5b66c68 --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_state.h @@ -0,0 +1,42 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include "db/compaction/compaction.h" +#include "db/compaction/subcompaction_state.h" +#include "db/internal_stats.h" + +// Data structures used by compaction_job and compaction_service_job, holding +// the list of sub_compact_states and the aggregated information for the +// compaction. +namespace ROCKSDB_NAMESPACE { + +// Maintains state for the entire compaction +class CompactionState { + public: + Compaction* const compaction; + + // REQUIRED: subcompaction states are stored in order of increasing key-range + std::vector<SubcompactionState> sub_compact_states; + Status status; + + void AggregateCompactionStats( + InternalStats::CompactionStatsFull& compaction_stats, + CompactionJobStats& compaction_job_stats); + + explicit CompactionState(Compaction* c) : compaction(c) {} + + Slice SmallestUserKey(); + + Slice LargestUserKey(); +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/compaction/file_pri.h b/src/rocksdb/db/compaction/file_pri.h new file mode 100644 index 000000000..82dddcf93 --- /dev/null +++ b/src/rocksdb/db/compaction/file_pri.h @@ -0,0 +1,92 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#pragma once +#include <algorithm> + +#include "db/version_edit.h" + +namespace ROCKSDB_NAMESPACE { +// We boost files that are closer to the TTL limit. This boosting could be +// done through FileMetaData.compensated_file_size, but that compensated size +// is widely used as something similar to file size, so dramatically boosting +// the value might cause unintended consequences. +// +// This boosting algorithm can get very fancy, but here we use a simple +// formula which can satisfy: +// (1) Different levels are triggered slightly differently to avoid +// too many cascading cases +// (2) Files in the same level get boosted more as TTL gets closer. +// +// Don't do any boosting before half of the TTL has passed. This keeps +// write amp lower for most cases. And all levels should be +// fully boosted when the total TTL compaction threshold triggers. +// Differentiate the boosting ranges of each level by 1/2. This makes the +// range for each level increase exponentially. We could instead make them +// equal, or go even fancier.
We can adjust it after +// we observe the behavior in production. +// The thresholds at which boosting starts: +// +------------------------------------------------------------------ + +// ^ ^ ^ ^ ^ ^ +// Age 0 ... | | second last level threshold +// | | +// | third last level +// | +// fourth last level +// +// The boost is arbitrarily set to 0 when a file's age reaches boost_age_start +// and grows linearly from there. The ratio is arbitrarily set so that when the +// next level starts to boost, the previous level's boosting amount is 16. +class FileTtlBooster { + public: + FileTtlBooster(uint64_t current_time, uint64_t ttl, int num_non_empty_levels, + int level) + : current_time_(current_time) { + if (ttl == 0 || level == 0 || level >= num_non_empty_levels - 1) { + enabled_ = false; + boost_age_start_ = 0; + boost_step_ = 1; + } else { + enabled_ = true; + uint64_t all_boost_start_age = ttl / 2; + uint64_t all_boost_age_range = (ttl / 32) * 31 - all_boost_start_age; + uint64_t boost_age_range = + all_boost_age_range >> (num_non_empty_levels - level - 1); + boost_age_start_ = all_boost_start_age + boost_age_range; + const uint64_t kBoostRatio = 16; + // Prevent a zero value to avoid a divide-by-zero error. + boost_step_ = std::max(boost_age_range / kBoostRatio, uint64_t{1}); + } + } + + uint64_t GetBoostScore(FileMetaData* f) { + if (!enabled_) { + return 1; + } + uint64_t oldest_ancester_time = f->TryGetOldestAncesterTime(); + if (oldest_ancester_time >= current_time_) { + return 1; + } + uint64_t age = current_time_ - oldest_ancester_time; + if (age > boost_age_start_) { + // Use integers just for convenience. + // We could make all file_to_order values double if we wanted. + // Technically this can overflow if users override timing and + // give a very high current time. Ignore that case for simplicity. + // Boosting is added to the current value, so +1. This effectively + // makes boosting kick in once the first boost_step_ is reached. + return (age - boost_age_start_) / boost_step_ + 1; + } + return 1; + } + + private: + bool enabled_; + uint64_t current_time_; + uint64_t boost_age_start_; + uint64_t boost_step_; +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/compaction/sst_partitioner.cc b/src/rocksdb/db/compaction/sst_partitioner.cc new file mode 100644 index 000000000..9e7f9fa89 --- /dev/null +++ b/src/rocksdb/db/compaction/sst_partitioner.cc @@ -0,0 +1,90 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory).
+// + +#include "rocksdb/sst_partitioner.h" + +#include <algorithm> + +#include "rocksdb/utilities/customizable_util.h" +#include "rocksdb/utilities/object_registry.h" +#include "rocksdb/utilities/options_type.h" + +namespace ROCKSDB_NAMESPACE { +static std::unordered_map<std::string, OptionTypeInfo> + sst_fixed_prefix_type_info = { +#ifndef ROCKSDB_LITE + {"length", + {0, OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, +#endif // ROCKSDB_LITE +}; + +SstPartitionerFixedPrefixFactory::SstPartitionerFixedPrefixFactory(size_t len) + : len_(len) { + RegisterOptions("Length", &len_, &sst_fixed_prefix_type_info); +} + +PartitionerResult SstPartitionerFixedPrefix::ShouldPartition( + const PartitionerRequest& request) { + Slice last_key_fixed(*request.prev_user_key); + if (last_key_fixed.size() > len_) { + last_key_fixed.size_ = len_; + } + Slice current_key_fixed(*request.current_user_key); + if (current_key_fixed.size() > len_) { + current_key_fixed.size_ = len_; + } + return last_key_fixed.compare(current_key_fixed) != 0 ? kRequired + : kNotRequired; +} + +bool SstPartitionerFixedPrefix::CanDoTrivialMove( + const Slice& smallest_user_key, const Slice& largest_user_key) { + return ShouldPartition(PartitionerRequest(smallest_user_key, largest_user_key, + 0)) == kNotRequired; +} + +std::unique_ptr<SstPartitioner> +SstPartitionerFixedPrefixFactory::CreatePartitioner( + const SstPartitioner::Context& /* context */) const { + return std::unique_ptr<SstPartitioner>(new SstPartitionerFixedPrefix(len_)); +} + +std::shared_ptr<SstPartitionerFactory> NewSstPartitionerFixedPrefixFactory( + size_t prefix_len) { + return std::make_shared<SstPartitionerFixedPrefixFactory>(prefix_len); +} + +#ifndef ROCKSDB_LITE +namespace { +static int RegisterSstPartitionerFactories(ObjectLibrary& library, + const std::string& /*arg*/) { + library.AddFactory<SstPartitionerFactory>( + SstPartitionerFixedPrefixFactory::kClassName(), + [](const std::string& /*uri*/, + std::unique_ptr<SstPartitionerFactory>* guard, + std::string* /* errmsg */) { + guard->reset(new SstPartitionerFixedPrefixFactory(0)); + return guard->get(); + }); + return 1; +} +} // namespace +#endif // ROCKSDB_LITE + +Status SstPartitionerFactory::CreateFromString( + const ConfigOptions& options, const std::string& value, + std::shared_ptr<SstPartitionerFactory>* result) { +#ifndef ROCKSDB_LITE + static std::once_flag once; + std::call_once(once, [&]() { + RegisterSstPartitionerFactories(*(ObjectLibrary::Default().get()), ""); + }); +#endif // ROCKSDB_LITE + return LoadSharedObject<SstPartitionerFactory>(options, value, nullptr, + result); +} +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/compaction/subcompaction_state.cc b/src/rocksdb/db/compaction/subcompaction_state.cc new file mode 100644 index 000000000..0c56471e9 --- /dev/null +++ b/src/rocksdb/db/compaction/subcompaction_state.cc @@ -0,0 +1,106 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "db/compaction/subcompaction_state.h" + +#include "rocksdb/sst_partitioner.h" + +namespace ROCKSDB_NAMESPACE { +void SubcompactionState::AggregateCompactionStats( + InternalStats::CompactionStatsFull& compaction_stats) const { + compaction_stats.stats.Add(compaction_outputs_.stats_); + if (HasPenultimateLevelOutputs()) { + compaction_stats.has_penultimate_level_output = true; + compaction_stats.penultimate_level_stats.Add( + penultimate_level_outputs_.stats_); + } +} + +OutputIterator SubcompactionState::GetOutputs() const { + return OutputIterator(penultimate_level_outputs_.outputs_, + compaction_outputs_.outputs_); +} + +void SubcompactionState::Cleanup(Cache* cache) { + penultimate_level_outputs_.Cleanup(); + compaction_outputs_.Cleanup(); + + if (!status.ok()) { + for (const auto& out : GetOutputs()) { + // If this file was inserted into the table cache then remove + // them here because this compaction was not committed. + TableCache::Evict(cache, out.meta.fd.GetNumber()); + } + } + // TODO: sub_compact.io_status is not checked like status. Not sure if thats + // intentional. So ignoring the io_status as of now. + io_status.PermitUncheckedError(); +} + +Slice SubcompactionState::SmallestUserKey() const { + if (has_penultimate_level_outputs_) { + Slice a = compaction_outputs_.SmallestUserKey(); + Slice b = penultimate_level_outputs_.SmallestUserKey(); + if (a.empty()) { + return b; + } + if (b.empty()) { + return a; + } + const Comparator* user_cmp = + compaction->column_family_data()->user_comparator(); + if (user_cmp->Compare(a, b) > 0) { + return b; + } else { + return a; + } + } else { + return compaction_outputs_.SmallestUserKey(); + } +} + +Slice SubcompactionState::LargestUserKey() const { + if (has_penultimate_level_outputs_) { + Slice a = compaction_outputs_.LargestUserKey(); + Slice b = penultimate_level_outputs_.LargestUserKey(); + if (a.empty()) { + return b; + } + if (b.empty()) { + return a; + } + const Comparator* user_cmp = + compaction->column_family_data()->user_comparator(); + if (user_cmp->Compare(a, b) < 0) { + return b; + } else { + return a; + } + } else { + return compaction_outputs_.LargestUserKey(); + } +} + +Status SubcompactionState::AddToOutput( + const CompactionIterator& iter, + const CompactionFileOpenFunc& open_file_func, + const CompactionFileCloseFunc& close_file_func) { + // update target output first + is_current_penultimate_level_ = iter.output_to_penultimate_level(); + current_outputs_ = is_current_penultimate_level_ ? &penultimate_level_outputs_ + : &compaction_outputs_; + if (is_current_penultimate_level_) { + has_penultimate_level_outputs_ = true; + } + + return Current().AddToOutput(iter, open_file_func, close_file_func); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/compaction/subcompaction_state.h b/src/rocksdb/db/compaction/subcompaction_state.h new file mode 100644 index 000000000..13e63120f --- /dev/null +++ b/src/rocksdb/db/compaction/subcompaction_state.h @@ -0,0 +1,214 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#pragma once + +#include <optional> + +#include "db/blob/blob_file_addition.h" +#include "db/blob/blob_garbage_meter.h" +#include "db/compaction/compaction.h" +#include "db/compaction/compaction_iterator.h" +#include "db/compaction/compaction_outputs.h" +#include "db/internal_stats.h" +#include "db/output_validator.h" +#include "db/range_del_aggregator.h" + +namespace ROCKSDB_NAMESPACE { + +// Maintains state and outputs for each sub-compaction +// It contains 2 `CompactionOutputs`: +// 1. one for the normal output files +// 2. another for the penultimate level outputs +// a `current` pointer maintains the current output group, when calling +// `AddToOutput()`, it checks the output of the current compaction_iterator key +// and point `current` to the target output group. By default, it just points to +// normal compaction_outputs, if the compaction_iterator key should be placed on +// the penultimate level, `current` is changed to point to +// `penultimate_level_outputs`. +// The later operations uses `Current()` to get the target group. +// +// +----------+ +-----------------------------+ +---------+ +// | *current |--------> | compaction_outputs |----->| output | +// +----------+ +-----------------------------+ +---------+ +// | | output | +// | +---------+ +// | | ... | +// | +// | +-----------------------------+ +---------+ +// +-------------> | penultimate_level_outputs |----->| output | +// +-----------------------------+ +---------+ +// | ... | + +class SubcompactionState { + public: + const Compaction* compaction; + + // The boundaries of the key-range this compaction is interested in. No two + // sub-compactions may have overlapping key-ranges. + // 'start' is inclusive, 'end' is exclusive, and nullptr means unbounded + const std::optional<Slice> start, end; + + // The return status of this sub-compaction + Status status; + + // The return IO Status of this sub-compaction + IOStatus io_status; + + // Notify on sub-compaction completion only if listener was notified on + // sub-compaction begin. + bool notify_on_subcompaction_completion = false; + + // compaction job stats for this sub-compaction + CompactionJobStats compaction_job_stats; + + // sub-compaction job id, which is used to identify different sub-compaction + // within the same compaction job. + const uint32_t sub_job_id; + + Slice SmallestUserKey() const; + + Slice LargestUserKey() const; + + // Get all outputs from the subcompaction. For per_key_placement compaction, + // it returns both the last level outputs and penultimate level outputs. + OutputIterator GetOutputs() const; + + // Assign range dels aggregator, for each range_del, it can only be assigned + // to one output level, for per_key_placement, it's going to be the + // penultimate level. 
+ void AssignRangeDelAggregator( + std::unique_ptr<CompactionRangeDelAggregator>&& range_del_agg) { + if (compaction->SupportsPerKeyPlacement()) { + penultimate_level_outputs_.AssignRangeDelAggregator( + std::move(range_del_agg)); + } else { + compaction_outputs_.AssignRangeDelAggregator(std::move(range_del_agg)); + } + } + + void RemoveLastEmptyOutput() { + compaction_outputs_.RemoveLastEmptyOutput(); + penultimate_level_outputs_.RemoveLastEmptyOutput(); + } + +#ifndef ROCKSDB_LITE + void BuildSubcompactionJobInfo( + SubcompactionJobInfo& subcompaction_job_info) const { + const Compaction* c = compaction; + const ColumnFamilyData* cfd = c->column_family_data(); + + subcompaction_job_info.cf_id = cfd->GetID(); + subcompaction_job_info.cf_name = cfd->GetName(); + subcompaction_job_info.status = status; + subcompaction_job_info.subcompaction_job_id = static_cast<int>(sub_job_id); + subcompaction_job_info.base_input_level = c->start_level(); + subcompaction_job_info.output_level = c->output_level(); + subcompaction_job_info.stats = compaction_job_stats; + } +#endif // !ROCKSDB_LITE + + SubcompactionState() = delete; + SubcompactionState(const SubcompactionState&) = delete; + SubcompactionState& operator=(const SubcompactionState&) = delete; + + SubcompactionState(Compaction* c, const std::optional<Slice> _start, + const std::optional<Slice> _end, uint32_t _sub_job_id) + : compaction(c), + start(_start), + end(_end), + sub_job_id(_sub_job_id), + compaction_outputs_(c, /*is_penultimate_level=*/false), + penultimate_level_outputs_(c, /*is_penultimate_level=*/true) { + assert(compaction != nullptr); + // Set output split key (used for RoundRobin feature) only for normal + // compaction_outputs, output to penultimate_level feature doesn't support + // RoundRobin feature (and may never going to be supported, because for + // RoundRobin, the data time is mostly naturally sorted, no need to have + // per-key placement with output_to_penultimate_level). + compaction_outputs_.SetOutputSlitKey(start, end); + } + + SubcompactionState(SubcompactionState&& state) noexcept + : compaction(state.compaction), + start(state.start), + end(state.end), + status(std::move(state.status)), + io_status(std::move(state.io_status)), + notify_on_subcompaction_completion( + state.notify_on_subcompaction_completion), + compaction_job_stats(std::move(state.compaction_job_stats)), + sub_job_id(state.sub_job_id), + compaction_outputs_(std::move(state.compaction_outputs_)), + penultimate_level_outputs_(std::move(state.penultimate_level_outputs_)), + is_current_penultimate_level_(state.is_current_penultimate_level_), + has_penultimate_level_outputs_(state.has_penultimate_level_outputs_) { + current_outputs_ = is_current_penultimate_level_ + ? 
&penultimate_level_outputs_ + : &compaction_outputs_; + } + + bool HasPenultimateLevelOutputs() const { + return has_penultimate_level_outputs_ || + penultimate_level_outputs_.HasRangeDel(); + } + + bool IsCurrentPenultimateLevel() const { + return is_current_penultimate_level_; + } + + // Add all the new files from this compaction to version_edit + void AddOutputsEdit(VersionEdit* out_edit) const { + for (const auto& file : penultimate_level_outputs_.outputs_) { + out_edit->AddFile(compaction->GetPenultimateLevel(), file.meta); + } + for (const auto& file : compaction_outputs_.outputs_) { + out_edit->AddFile(compaction->output_level(), file.meta); + } + } + + void Cleanup(Cache* cache); + + void AggregateCompactionStats( + InternalStats::CompactionStatsFull& compaction_stats) const; + + CompactionOutputs& Current() const { + assert(current_outputs_); + return *current_outputs_; + } + + // Add compaction_iterator key/value to the `Current` output group. + Status AddToOutput(const CompactionIterator& iter, + const CompactionFileOpenFunc& open_file_func, + const CompactionFileCloseFunc& close_file_func); + + // Close all compaction output files, both output_to_penultimate_level outputs + // and normal outputs. + Status CloseCompactionFiles(const Status& curr_status, + const CompactionFileOpenFunc& open_file_func, + const CompactionFileCloseFunc& close_file_func) { + // Call FinishCompactionOutputFile() even if status is not ok: it needs to + // close the output file. + Status s = penultimate_level_outputs_.CloseOutput( + curr_status, open_file_func, close_file_func); + s = compaction_outputs_.CloseOutput(s, open_file_func, close_file_func); + return s; + } + + private: + // State kept for output being generated + CompactionOutputs compaction_outputs_; + CompactionOutputs penultimate_level_outputs_; + CompactionOutputs* current_outputs_ = &compaction_outputs_; + bool is_current_penultimate_level_ = false; + bool has_penultimate_level_outputs_ = false; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/compaction/tiered_compaction_test.cc b/src/rocksdb/db/compaction/tiered_compaction_test.cc new file mode 100644 index 000000000..aaebcfd94 --- /dev/null +++ b/src/rocksdb/db/compaction/tiered_compaction_test.cc @@ -0,0 +1,2028 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "rocksdb/iostats_context.h" +#include "rocksdb/listener.h" +#include "rocksdb/utilities/debug.h" +#include "test_util/mock_time_env.h" + +namespace ROCKSDB_NAMESPACE { + +#if !defined(ROCKSDB_LITE) + +class TieredCompactionTest : public DBTestBase, + public testing::WithParamInterface<bool> { + public: + TieredCompactionTest() + : DBTestBase("tiered_compaction_test", /*env_do_fsync=*/true), + kBasicCompStats(CompactionReason::kUniversalSizeAmplification, 1), + kBasicPerKeyPlacementCompStats( + CompactionReason::kUniversalSizeAmplification, 1), + kBasicFlushStats(CompactionReason::kFlush, 1) { + kBasicCompStats.micros = kHasValue; + kBasicCompStats.cpu_micros = kHasValue; + kBasicCompStats.bytes_read_non_output_levels = kHasValue; + kBasicCompStats.num_input_files_in_non_output_levels = kHasValue; + kBasicCompStats.num_input_records = kHasValue; + kBasicCompStats.num_dropped_records = kHasValue; + + kBasicPerLevelStats.num_output_records = kHasValue; + kBasicPerLevelStats.bytes_written = kHasValue; + kBasicPerLevelStats.num_output_files = kHasValue; + + kBasicPerKeyPlacementCompStats.micros = kHasValue; + kBasicPerKeyPlacementCompStats.cpu_micros = kHasValue; + kBasicPerKeyPlacementCompStats.Add(kBasicPerLevelStats); + + kBasicFlushStats.micros = kHasValue; + kBasicFlushStats.cpu_micros = kHasValue; + kBasicFlushStats.bytes_written = kHasValue; + kBasicFlushStats.num_output_files = kHasValue; + } + + protected: + static constexpr uint8_t kHasValue = 1; + + InternalStats::CompactionStats kBasicCompStats; + InternalStats::CompactionStats kBasicPerKeyPlacementCompStats; + InternalStats::CompactionOutputsStats kBasicPerLevelStats; + InternalStats::CompactionStats kBasicFlushStats; + + std::atomic_bool enable_per_key_placement = true; + + void SetUp() override { + SyncPoint::GetInstance()->SetCallBack( + "Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) { + auto supports_per_key_placement = static_cast<bool*>(arg); + *supports_per_key_placement = enable_per_key_placement; + }); + SyncPoint::GetInstance()->EnableProcessing(); + } + + const std::vector<InternalStats::CompactionStats>& GetCompactionStats() { + VersionSet* const versions = dbfull()->GetVersionSet(); + assert(versions); + assert(versions->GetColumnFamilySet()); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + assert(cfd); + + const InternalStats* const internal_stats = cfd->internal_stats(); + assert(internal_stats); + + return internal_stats->TEST_GetCompactionStats(); + } + + const InternalStats::CompactionStats& GetPerKeyPlacementCompactionStats() { + VersionSet* const versions = dbfull()->GetVersionSet(); + assert(versions); + assert(versions->GetColumnFamilySet()); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + assert(cfd); + + const InternalStats* const internal_stats = cfd->internal_stats(); + assert(internal_stats); + + return internal_stats->TEST_GetPerKeyPlacementCompactionStats(); + } + + // Verify the compaction stats, the stats are roughly compared + void VerifyCompactionStats( + const std::vector<InternalStats::CompactionStats>& expect_stats, + const InternalStats::CompactionStats& expect_pl_stats) { + const std::vector<InternalStats::CompactionStats>& stats = + GetCompactionStats(); + const size_t kLevels = expect_stats.size(); + ASSERT_EQ(kLevels, stats.size()); + + for (auto it = stats.begin(), expect = expect_stats.begin(); + it != stats.end(); it++, expect++) { + 
VerifyCompactionStats(*it, *expect); + } + + const InternalStats::CompactionStats& pl_stats = + GetPerKeyPlacementCompactionStats(); + VerifyCompactionStats(pl_stats, expect_pl_stats); + } + + void ResetAllStats(std::vector<InternalStats::CompactionStats>& stats, + InternalStats::CompactionStats& pl_stats) { + ASSERT_OK(dbfull()->ResetStats()); + for (auto& level_stats : stats) { + level_stats.Clear(); + } + pl_stats.Clear(); + } + + // bottommost_temperature is renaming to last_level_temperature, set either + // of them should have the same effect. + void SetColdTemperature(Options& options) { + if (GetParam()) { + options.bottommost_temperature = Temperature::kCold; + } else { + options.last_level_temperature = Temperature::kCold; + } + } + + private: + void CompareStats(uint64_t val, uint64_t expect) { + if (expect > 0) { + ASSERT_TRUE(val > 0); + } else { + ASSERT_EQ(val, 0); + } + } + + void VerifyCompactionStats( + const InternalStats::CompactionStats& stats, + const InternalStats::CompactionStats& expect_stats) { + CompareStats(stats.micros, expect_stats.micros); + CompareStats(stats.cpu_micros, expect_stats.cpu_micros); + CompareStats(stats.bytes_read_non_output_levels, + expect_stats.bytes_read_non_output_levels); + CompareStats(stats.bytes_read_output_level, + expect_stats.bytes_read_output_level); + CompareStats(stats.bytes_read_blob, expect_stats.bytes_read_blob); + CompareStats(stats.bytes_written, expect_stats.bytes_written); + CompareStats(stats.bytes_moved, expect_stats.bytes_moved); + CompareStats(stats.num_input_files_in_non_output_levels, + expect_stats.num_input_files_in_non_output_levels); + CompareStats(stats.num_input_files_in_output_level, + expect_stats.num_input_files_in_output_level); + CompareStats(stats.num_output_files, expect_stats.num_output_files); + CompareStats(stats.num_output_files_blob, + expect_stats.num_output_files_blob); + CompareStats(stats.num_input_records, expect_stats.num_input_records); + CompareStats(stats.num_dropped_records, expect_stats.num_dropped_records); + CompareStats(stats.num_output_records, expect_stats.num_output_records); + ASSERT_EQ(stats.count, expect_stats.count); + for (int i = 0; i < static_cast<int>(CompactionReason::kNumOfReasons); + i++) { + ASSERT_EQ(stats.counts[i], expect_stats.counts[i]); + } + } +}; + +TEST_P(TieredCompactionTest, SequenceBasedTieredStorageUniversal) { + const int kNumTrigger = 4; + const int kNumLevels = 7; + const int kNumKeys = 100; + const int kLastLevel = kNumLevels - 1; + + auto options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + SetColdTemperature(options); + options.level0_file_num_compaction_trigger = kNumTrigger; + options.statistics = CreateDBStatistics(); + options.max_subcompactions = 10; + DestroyAndReopen(options); + + std::atomic_uint64_t latest_cold_seq = 0; + std::vector<SequenceNumber> seq_history; + + SyncPoint::GetInstance()->SetCallBack( + "CompactionIterator::PrepareOutput.context", [&](void* arg) { + auto context = static_cast<PerKeyPlacementContext*>(arg); + context->output_to_penultimate_level = + context->seq_num > latest_cold_seq; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + std::vector<InternalStats::CompactionStats> expect_stats(kNumLevels); + InternalStats::CompactionStats& last_stats = expect_stats[kLastLevel]; + InternalStats::CompactionStats expect_pl_stats; + + for (int i = 0; i < kNumTrigger; i++) { + for (int j = 0; j < kNumKeys; j++) { + ASSERT_OK(Put(Key(i * 10 + j), "value" + std::to_string(i))); + } + 
ASSERT_OK(Flush()); + seq_history.emplace_back(dbfull()->GetLatestSequenceNumber()); + expect_stats[0].Add(kBasicFlushStats); + } + ASSERT_OK(dbfull()->WaitForCompact(true)); + + // the penultimate level file temperature is not cold, all data are output to + // the penultimate level. + ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel()); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0); + + // basic compaction stats are still counted to the last level + expect_stats[kLastLevel].Add(kBasicCompStats); + expect_pl_stats.Add(kBasicPerKeyPlacementCompStats); + + VerifyCompactionStats(expect_stats, expect_pl_stats); + + ResetAllStats(expect_stats, expect_pl_stats); + + // move forward the cold_seq to split the file into 2 levels, so should have + // both the last level stats and the output_to_penultimate_level stats + latest_cold_seq = seq_history[0]; + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel()); + + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + + last_stats.Add(kBasicCompStats); + last_stats.ResetCompactionReason(CompactionReason::kManualCompaction); + last_stats.Add(kBasicPerLevelStats); + last_stats.num_dropped_records = 0; + expect_pl_stats.Add(kBasicPerKeyPlacementCompStats); + expect_pl_stats.ResetCompactionReason(CompactionReason::kManualCompaction); + VerifyCompactionStats(expect_stats, expect_pl_stats); + + // delete all cold data, so all data will be on penultimate level + for (int i = 0; i < 10; i++) { + ASSERT_OK(Delete(Key(i))); + } + ASSERT_OK(Flush()); + + ResetAllStats(expect_stats, expect_pl_stats); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel()); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0); + + last_stats.Add(kBasicCompStats); + last_stats.ResetCompactionReason(CompactionReason::kManualCompaction); + last_stats.bytes_read_output_level = kHasValue; + last_stats.num_input_files_in_output_level = kHasValue; + expect_pl_stats.Add(kBasicPerKeyPlacementCompStats); + expect_pl_stats.ResetCompactionReason(CompactionReason::kManualCompaction); + VerifyCompactionStats(expect_stats, expect_pl_stats); + + // move forward the cold_seq again with range delete, take a snapshot to keep + // the range dels in both cold and hot SSTs + auto snap = db_->GetSnapshot(); + latest_cold_seq = seq_history[2]; + std::string start = Key(25), end = Key(35); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end)); + ASSERT_OK(Flush()); + + ResetAllStats(expect_stats, expect_pl_stats); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel()); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + + last_stats.Add(kBasicCompStats); + last_stats.Add(kBasicPerLevelStats); + last_stats.ResetCompactionReason(CompactionReason::kManualCompaction); + expect_pl_stats.Add(kBasicPerKeyPlacementCompStats); + expect_pl_stats.ResetCompactionReason(CompactionReason::kManualCompaction); + VerifyCompactionStats(expect_stats, expect_pl_stats); + + // verify data + std::string value; + for (int i = 0; i < kNumKeys; i++) { + if (i < 10 || (i >= 25 && i < 35)) { + ASSERT_TRUE(db_->Get(ReadOptions(), Key(i), &value).IsNotFound()); + } else { + 
ASSERT_OK(db_->Get(ReadOptions(), Key(i), &value)); + } + } + + // range delete all hot data + start = Key(30); + end = Key(130); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end)); + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel()); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + + // no range del is dropped because of snapshot + ASSERT_EQ( + options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE), + 0); + + // release the snapshot and do compaction again should remove all hot data + db_->ReleaseSnapshot(snap); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel()); + ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + + // 2 range dels are dropped + ASSERT_EQ( + options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE), + 3); + + // move backward the cold_seq, for example the user may change the setting of + // hot/cold data, but it won't impact the existing cold data, as the sequence + // number is zeroed out. + latest_cold_seq = seq_history[1]; + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel()); + ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); +} + +TEST_P(TieredCompactionTest, RangeBasedTieredStorageUniversal) { + const int kNumTrigger = 4; + const int kNumLevels = 7; + const int kNumKeys = 100; + const int kLastLevel = kNumLevels - 1; + + auto options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + SetColdTemperature(options); + options.level0_file_num_compaction_trigger = kNumTrigger; + options.statistics = CreateDBStatistics(); + options.max_subcompactions = 10; + DestroyAndReopen(options); + auto cmp = options.comparator; + + port::Mutex mutex; + std::string hot_start = Key(10); + std::string hot_end = Key(50); + + SyncPoint::GetInstance()->SetCallBack( + "CompactionIterator::PrepareOutput.context", [&](void* arg) { + auto context = static_cast<PerKeyPlacementContext*>(arg); + MutexLock l(&mutex); + context->output_to_penultimate_level = + cmp->Compare(context->key, hot_start) >= 0 && + cmp->Compare(context->key, hot_end) < 0; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + std::vector<InternalStats::CompactionStats> expect_stats(kNumLevels); + InternalStats::CompactionStats& last_stats = expect_stats[kLastLevel]; + InternalStats::CompactionStats expect_pl_stats; + + for (int i = 0; i < kNumTrigger; i++) { + for (int j = 0; j < kNumKeys; j++) { + ASSERT_OK(Put(Key(j), "value" + std::to_string(j))); + } + ASSERT_OK(Flush()); + expect_stats[0].Add(kBasicFlushStats); + } + ASSERT_OK(dbfull()->WaitForCompact(true)); + ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel()); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + + last_stats.Add(kBasicCompStats); + last_stats.Add(kBasicPerLevelStats); + expect_pl_stats.Add(kBasicPerKeyPlacementCompStats); + VerifyCompactionStats(expect_stats, expect_pl_stats); + + ResetAllStats(expect_stats, expect_pl_stats); + + // change to all cold, no output_to_penultimate_level output + { + MutexLock l(&mutex); + hot_start = Key(100); + hot_end = Key(200); + } + 
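// All existing keys (0..99) now fall outside [hot_start, hot_end), so the compaction below should place everything on the last (cold) level. +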
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel()); + ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + + last_stats.Add(kBasicCompStats); + last_stats.ResetCompactionReason(CompactionReason::kManualCompaction); + last_stats.Add(kBasicPerLevelStats); + last_stats.num_dropped_records = 0; + last_stats.bytes_read_output_level = kHasValue; + last_stats.num_input_files_in_output_level = kHasValue; + VerifyCompactionStats(expect_stats, expect_pl_stats); + + // change to all hot, universal compaction support moving data to up level if + // it's within compaction level range. + { + MutexLock l(&mutex); + hot_start = Key(0); + hot_end = Key(100); + } + + // No data is moved from cold tier to hot tier because no input files from L5 + // or higher, it's not safe to move data to output_to_penultimate_level level. + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel()); + + // Add 2 keys in higher level, but in separated files, all keys can be moved + // up if it's hot + ASSERT_OK(Put(Key(0), "value" + std::to_string(0))); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Key(50), "value" + std::to_string(0))); + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel()); + + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0); + + // change to only 1 key cold, to test compaction could stop even it matches + // size amp compaction threshold + { + MutexLock l(&mutex); + hot_start = Key(1); + hot_end = Key(1000); + } + + // generate files just enough to trigger compaction + for (int i = 0; i < kNumTrigger - 1; i++) { + for (int j = 0; j < 1000; j++) { + ASSERT_OK(Put(Key(j), "value" + std::to_string(j))); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->WaitForCompact( + true)); // make sure the compaction is able to finish + ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel()); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + auto opts = db_->GetOptions(); + auto max_size_amp = + opts.compaction_options_universal.max_size_amplification_percent / 100; + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), + GetSstSizeHelper(Temperature::kCold) * max_size_amp); + + // delete all cold data + ASSERT_OK(Delete(Key(0))); + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel()); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0); + + // range delete overlap with both hot/cold data, with a snapshot to make sure + // the range del is saved + auto snap = db_->GetSnapshot(); + { + MutexLock l(&mutex); + hot_start = Key(50); + hot_end = Key(100); + } + std::string start = Key(1), end = Key(70); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end)); + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel()); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + + // no range del is dropped until snapshot is released + ASSERT_EQ( + options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE), + 0); + + // verify data + std::string 
value; + for (int i = 0; i < kNumKeys; i++) { + if (i < 70) { + ASSERT_TRUE(db_->Get(ReadOptions(), Key(i), &value).IsNotFound()); + } else { + ASSERT_OK(db_->Get(ReadOptions(), Key(i), &value)); + } + } + + db_->ReleaseSnapshot(snap); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel()); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + + // range del is dropped + ASSERT_EQ( + options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE), + 1); +} + +TEST_P(TieredCompactionTest, LevelColdRangeDelete) { + const int kNumTrigger = 4; + const int kNumLevels = 7; + const int kNumKeys = 100; + const int kLastLevel = kNumLevels - 1; + + auto options = CurrentOptions(); + SetColdTemperature(options); + options.level0_file_num_compaction_trigger = kNumTrigger; + options.num_levels = kNumLevels; + options.statistics = CreateDBStatistics(); + options.max_subcompactions = 10; + DestroyAndReopen(options); + + std::atomic_uint64_t latest_cold_seq = 0; + + SyncPoint::GetInstance()->SetCallBack( + "CompactionIterator::PrepareOutput.context", [&](void* arg) { + auto context = static_cast<PerKeyPlacementContext*>(arg); + context->output_to_penultimate_level = + context->seq_num > latest_cold_seq; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + for (int i = 0; i < kNumKeys; i++) { + ASSERT_OK(Put(Key(i), "value" + std::to_string(i))); + } + ASSERT_OK(Flush()); + + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,1", + FilesPerLevel()); // bottommost but not last level file is hot + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0); + + // explicitly move the data to the last level + MoveFilesToLevel(kLastLevel); + + ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel()); + + auto snap = db_->GetSnapshot(); + + std::string start = Key(10); + std::string end = Key(50); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end)); + + // Keys 20->30 will be marked as cold data, but they cannot be placed in the + // cold tier (bottommost); otherwise they would be "deleted" by the range del + // in the output_to_penultimate_level level. Verify that this data can still + // be queried. + for (int i = 20; i < 30; i++) { + ASSERT_OK(Put(Key(i), "value" + std::to_string(i))); + } + // make the range tombstone and data after that cold + latest_cold_seq = dbfull()->GetLatestSequenceNumber(); + + // add some hot data, just for the test + for (int i = 30; i < 40; i++) { + ASSERT_OK(Put(Key(i), "value" + std::to_string(i))); + } + + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + + std::string value; + for (int i = 0; i < kNumKeys; i++) { + auto s = db_->Get(ReadOptions(), Key(i), &value); + if ((i >= 10 && i < 20) || (i >= 40 && i < 50)) { + ASSERT_TRUE(s.IsNotFound()); + } else { + ASSERT_OK(s); + } + } + + db_->ReleaseSnapshot(snap); +} + +// Test SST partitioner that cuts after every single key +class SingleKeySstPartitioner : public SstPartitioner { + public: + const char* Name() const override { return "SingleKeySstPartitioner"; } + + PartitionerResult ShouldPartition( + const PartitionerRequest& /*request*/) override { + return kRequired; + } + + bool CanDoTrivialMove(const Slice& /*smallest_user_key*/, + const Slice& /*largest_user_key*/) override { + return false; + } +}; + +class
SingleKeySstPartitionerFactory : public SstPartitionerFactory { + public: + static const char* kClassName() { return "SingleKeySstPartitionerFactory"; } + const char* Name() const override { return kClassName(); } + + std::unique_ptr<SstPartitioner> CreatePartitioner( + const SstPartitioner::Context& /* context */) const override { + return std::unique_ptr<SstPartitioner>(new SingleKeySstPartitioner()); + } +}; + +TEST_P(TieredCompactionTest, LevelOutofBoundaryRangeDelete) { + const int kNumTrigger = 4; + const int kNumLevels = 3; + const int kNumKeys = 10; + + auto factory = std::make_shared<SingleKeySstPartitionerFactory>(); + auto options = CurrentOptions(); + SetColdTemperature(options); + options.level0_file_num_compaction_trigger = kNumTrigger; + options.num_levels = kNumLevels; + options.statistics = CreateDBStatistics(); + options.sst_partitioner_factory = factory; + options.max_subcompactions = 10; + DestroyAndReopen(options); + + std::atomic_uint64_t latest_cold_seq = 0; + + SyncPoint::GetInstance()->SetCallBack( + "CompactionIterator::PrepareOutput.context", [&](void* arg) { + auto context = static_cast<PerKeyPlacementContext*>(arg); + context->output_to_penultimate_level = + context->seq_num > latest_cold_seq; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + for (int i = 0; i < kNumKeys; i++) { + ASSERT_OK(Put(Key(i), "value" + std::to_string(i))); + } + ASSERT_OK(Flush()); + + MoveFilesToLevel(kNumLevels - 1); + ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + ASSERT_EQ("0,0,10", FilesPerLevel()); + + auto snap = db_->GetSnapshot(); + + // only range delete + std::string start = Key(3); + std::string end = Key(5); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end)); + ASSERT_OK(Flush()); + + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + + ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), + 0); // tombstone has no size, even it's in hot tier + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + ASSERT_EQ("0,1,10", + FilesPerLevel()); // one file is at the penultimate level which + // only contains a range delete + + // Add 2 hot keys, each is a new SST, they will be placed in the same level as + // range del, but they don't have overlap with range del, make sure the range + // del will still be placed there + latest_cold_seq = dbfull()->GetLatestSequenceNumber(); + ASSERT_OK(Put(Key(0), "new value" + std::to_string(0))); + auto snap2 = db_->GetSnapshot(); + ASSERT_OK(Put(Key(6), "new value" + std::to_string(6))); + ASSERT_OK(Flush()); + + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,2,10", + FilesPerLevel()); // one file is at the penultimate level + // which only contains a range delete + std::vector<LiveFileMetaData> live_file_meta; + db_->GetLiveFilesMetaData(&live_file_meta); + bool found_sst_with_del = false; + uint64_t sst_with_del_num = 0; + for (const auto& meta : live_file_meta) { + if (meta.num_deletions > 0) { + // found SST with del, which has 2 entries, one for data one for range del + ASSERT_EQ(meta.level, + kNumLevels - 2); // output to penultimate level + ASSERT_EQ(meta.num_entries, 2); + ASSERT_EQ(meta.num_deletions, 1); + found_sst_with_del = true; + sst_with_del_num = meta.file_number; + } + } + ASSERT_TRUE(found_sst_with_del); + + // release the first snapshot and compact, which should compact the range del + // but new 
inserted key `0` and `6` are still hot data which will be placed on + // the penultimate level + db_->ReleaseSnapshot(snap); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,2,7", FilesPerLevel()); + db_->GetLiveFilesMetaData(&live_file_meta); + found_sst_with_del = false; + for (const auto& meta : live_file_meta) { + // check new SST with del (the old one may not yet be deleted after + // compaction) + if (meta.num_deletions > 0 && meta.file_number != sst_with_del_num) { + found_sst_with_del = true; + } + } + ASSERT_FALSE(found_sst_with_del); + + // Now make all data cold, key 0 will be moved to the last level, but key 6 is + // still in snap2, so it will be kept at the penultimate level + latest_cold_seq = dbfull()->GetLatestSequenceNumber(); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,1,8", FilesPerLevel()); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + + db_->ReleaseSnapshot(snap2); + + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,0,8", FilesPerLevel()); + ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); +} + +TEST_P(TieredCompactionTest, UniversalRangeDelete) { + const int kNumTrigger = 4; + const int kNumLevels = 7; + const int kNumKeys = 10; + + auto factory = std::make_shared<SingleKeySstPartitionerFactory>(); + + auto options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + SetColdTemperature(options); + options.level0_file_num_compaction_trigger = kNumTrigger; + options.statistics = CreateDBStatistics(); + options.sst_partitioner_factory = factory; + options.max_subcompactions = 10; + DestroyAndReopen(options); + + std::atomic_uint64_t latest_cold_seq = 0; + + SyncPoint::GetInstance()->SetCallBack( + "CompactionIterator::PrepareOutput.context", [&](void* arg) { + auto context = static_cast<PerKeyPlacementContext*>(arg); + context->output_to_penultimate_level = + context->seq_num > latest_cold_seq; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + for (int i = 0; i < kNumKeys; i++) { + ASSERT_OK(Put(Key(i), "value" + std::to_string(i))); + } + ASSERT_OK(Flush()); + + // compact to the penultimate level with 10 files + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + + ASSERT_EQ("0,0,0,0,0,10", FilesPerLevel()); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0); + + // make all data cold + latest_cold_seq = dbfull()->GetLatestSequenceNumber(); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,0,10", FilesPerLevel()); + ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + + // range del which considered as hot data, but it will be merged and deleted + // with the last level data + std::string start = Key(3); + std::string end = Key(5); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end)); + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + + ASSERT_EQ("0,0,0,0,0,0,8", FilesPerLevel()); + + // range del with snapshot should be preserved in the penultimate level + auto snap = db_->GetSnapshot(); + + start = Key(6); + end = Key(8); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end)); + ASSERT_OK(Flush()); + 
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,1,8", FilesPerLevel()); + + // Add 2 hot keys, each is a new SST, they will be placed in the same level as + // range del, but no overlap with range del. + latest_cold_seq = dbfull()->GetLatestSequenceNumber(); + ASSERT_OK(Put(Key(4), "new value" + std::to_string(0))); + auto snap2 = db_->GetSnapshot(); + ASSERT_OK(Put(Key(9), "new value" + std::to_string(6))); + + ASSERT_OK(Flush()); + + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,2,8", FilesPerLevel()); + // find the SST with range del + std::vector<LiveFileMetaData> live_file_meta; + db_->GetLiveFilesMetaData(&live_file_meta); + bool found_sst_with_del = false; + uint64_t sst_with_del_num = 0; + for (const auto& meta : live_file_meta) { + if (meta.num_deletions > 0) { + // found SST with del, which has 2 entries, one for data one for range del + ASSERT_EQ(meta.level, + kNumLevels - 2); // output_to_penultimate_level level + ASSERT_EQ(meta.num_entries, 2); + ASSERT_EQ(meta.num_deletions, 1); + found_sst_with_del = true; + sst_with_del_num = meta.file_number; + } + } + ASSERT_TRUE(found_sst_with_del); + + // release the first snapshot which should compact the range del, but data on + // the same level is still hot + db_->ReleaseSnapshot(snap); + + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,2,6", FilesPerLevel()); + db_->GetLiveFilesMetaData(&live_file_meta); + // no range del should be found in SST + found_sst_with_del = false; + for (const auto& meta : live_file_meta) { + // check new SST with del (the old one may not yet be deleted after + // compaction) + if (meta.num_deletions > 0 && meta.file_number != sst_with_del_num) { + found_sst_with_del = true; + } + } + ASSERT_FALSE(found_sst_with_del); + + // make all data to cold, but key 6 is still protected by snap2 + latest_cold_seq = dbfull()->GetLatestSequenceNumber(); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,1,7", FilesPerLevel()); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + + db_->ReleaseSnapshot(snap2); + + // release snapshot, everything go to bottommost + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,0,7", FilesPerLevel()); + ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); +} + +TEST_P(TieredCompactionTest, SequenceBasedTieredStorageLevel) { + const int kNumTrigger = 4; + const int kNumLevels = 7; + const int kNumKeys = 100; + const int kLastLevel = kNumLevels - 1; + + auto options = CurrentOptions(); + SetColdTemperature(options); + options.level0_file_num_compaction_trigger = kNumTrigger; + options.num_levels = kNumLevels; + options.statistics = CreateDBStatistics(); + options.max_subcompactions = 10; + DestroyAndReopen(options); + + std::atomic_uint64_t latest_cold_seq = 0; + std::vector<SequenceNumber> seq_history; + + SyncPoint::GetInstance()->SetCallBack( + "CompactionIterator::PrepareOutput.context", [&](void* arg) { + auto context = static_cast<PerKeyPlacementContext*>(arg); + context->output_to_penultimate_level = + context->seq_num > latest_cold_seq; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + std::vector<InternalStats::CompactionStats> expect_stats(kNumLevels); + InternalStats::CompactionStats& last_stats = expect_stats[kLastLevel]; + InternalStats::CompactionStats expect_pl_stats; + + for (int i = 0; i < kNumTrigger; i++) { 
+ for (int j = 0; j < kNumKeys; j++) { + ASSERT_OK(Put(Key(i * 10 + j), "value" + std::to_string(i))); + } + ASSERT_OK(Flush()); + expect_stats[0].Add(kBasicFlushStats); + } + ASSERT_OK(dbfull()->WaitForCompact(true)); + + // non last level is hot + ASSERT_EQ("0,1", FilesPerLevel()); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0); + + expect_stats[1].Add(kBasicCompStats); + expect_stats[1].Add(kBasicPerLevelStats); + expect_stats[1].ResetCompactionReason(CompactionReason::kLevelL0FilesNum); + VerifyCompactionStats(expect_stats, expect_pl_stats); + + // move all data to the last level + MoveFilesToLevel(kLastLevel); + + ResetAllStats(expect_stats, expect_pl_stats); + + // The compaction won't move the data up + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel()); + ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + + last_stats.Add(kBasicCompStats); + last_stats.Add(kBasicPerLevelStats); + last_stats.num_dropped_records = 0; + last_stats.bytes_read_non_output_levels = 0; + last_stats.num_input_files_in_non_output_levels = 0; + last_stats.bytes_read_output_level = kHasValue; + last_stats.num_input_files_in_output_level = kHasValue; + last_stats.ResetCompactionReason(CompactionReason::kManualCompaction); + VerifyCompactionStats(expect_stats, expect_pl_stats); + + // Add new data, which is all hot and overriding all existing data + for (int i = 0; i < kNumTrigger; i++) { + for (int j = 0; j < kNumKeys; j++) { + ASSERT_OK(Put(Key(i * 10 + j), "value" + std::to_string(i))); + } + ASSERT_OK(Flush()); + seq_history.emplace_back(dbfull()->GetLatestSequenceNumber()); + } + ASSERT_OK(dbfull()->WaitForCompact(true)); + ASSERT_EQ("0,1,0,0,0,0,1", FilesPerLevel()); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + + ResetAllStats(expect_stats, expect_pl_stats); + + // after compaction, all data are hot + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel()); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0); + + for (int level = 2; level < kNumLevels - 1; level++) { + expect_stats[level].bytes_moved = kHasValue; + } + + last_stats.Add(kBasicCompStats); + last_stats.bytes_read_output_level = kHasValue; + last_stats.num_input_files_in_output_level = kHasValue; + last_stats.ResetCompactionReason(CompactionReason::kManualCompaction); + expect_pl_stats.Add(kBasicPerKeyPlacementCompStats); + expect_pl_stats.ResetCompactionReason(CompactionReason::kManualCompaction); + VerifyCompactionStats(expect_stats, expect_pl_stats); + + // move forward the cold_seq, try to split the data into cold and hot, but in + // this case it's unsafe to split the data + // because it's non-last-level but bottommost file, the sequence number will + // be zeroed out and lost the time information (with + // `level_compaction_dynamic_level_bytes` or Universal Compaction, it should + // be rare.) 
+ // TODO(zjay): ideally we should avoid zero out non-last-level bottommost file + latest_cold_seq = seq_history[1]; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel()); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0); + + seq_history.clear(); + + // manually move all data (cold) to last level + MoveFilesToLevel(kLastLevel); + seq_history.clear(); + // Add new data once again + for (int i = 0; i < kNumTrigger; i++) { + for (int j = 0; j < kNumKeys; j++) { + ASSERT_OK(Put(Key(i * 10 + j), "value" + std::to_string(i))); + } + ASSERT_OK(Flush()); + seq_history.emplace_back(dbfull()->GetLatestSequenceNumber()); + } + ASSERT_OK(dbfull()->WaitForCompact(true)); + + latest_cold_seq = seq_history[0]; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel()); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + + // delete all cold data + for (int i = 0; i < 10; i++) { + ASSERT_OK(Delete(Key(i))); + } + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel()); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0); + + latest_cold_seq = seq_history[2]; + + MoveFilesToLevel(kLastLevel); + + // move forward the cold_seq again with range delete, take a snapshot to keep + // the range dels in bottommost + auto snap = db_->GetSnapshot(); + + std::string start = Key(25), end = Key(35); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end)); + // add one small key and large key in the input level, to make sure it's able + // to move hot data to input level within that range + ASSERT_OK(Put(Key(0), "value" + std::to_string(0))); + ASSERT_OK(Put(Key(100), "value" + std::to_string(0))); + + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel()); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + + // verify data + std::string value; + for (int i = 1; i < 130; i++) { + if (i < 10 || (i >= 25 && i < 35)) { + ASSERT_TRUE(db_->Get(ReadOptions(), Key(i), &value).IsNotFound()); + } else { + ASSERT_OK(db_->Get(ReadOptions(), Key(i), &value)); + } + } + + // delete all hot data + ASSERT_OK(Delete(Key(0))); + start = Key(30); + end = Key(101); // range [101, 130] is cold, because it's not in input range + // in previous compaction + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end)); + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel()); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + + // no range del is dropped because of snapshot + ASSERT_EQ( + options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE), + 0); + + db_->ReleaseSnapshot(snap); + + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel()); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + + // 3 range dels dropped, the first one is double counted as expected, which is + // spread into 2 SST files + ASSERT_EQ( + options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE), + 3); + 
+ // Move cold_seq backward, which might happen when the user changes the + // setting. The hot data won't move up; this just makes sure it still runs + // fine, because: + // 1. the sequence numbers are zeroed out, so there is no time information + // 2. leveled compaction only supports moving data up within the higher-level + // input range + latest_cold_seq = seq_history[1]; + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel()); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); +} + +TEST_P(TieredCompactionTest, RangeBasedTieredStorageLevel) { + const int kNumTrigger = 4; + const int kNumLevels = 7; + const int kNumKeys = 100; + + auto options = CurrentOptions(); + SetColdTemperature(options); + options.level0_file_num_compaction_trigger = kNumTrigger; + options.level_compaction_dynamic_level_bytes = true; + options.num_levels = kNumLevels; + options.statistics = CreateDBStatistics(); + options.max_subcompactions = 10; + DestroyAndReopen(options); + auto cmp = options.comparator; + + port::Mutex mutex; + std::string hot_start = Key(10); + std::string hot_end = Key(50); + + SyncPoint::GetInstance()->SetCallBack( + "CompactionIterator::PrepareOutput.context", [&](void* arg) { + auto context = static_cast<PerKeyPlacementContext*>(arg); + MutexLock l(&mutex); + context->output_to_penultimate_level = + cmp->Compare(context->key, hot_start) >= 0 && + cmp->Compare(context->key, hot_end) < 0; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + for (int i = 0; i < kNumTrigger; i++) { + for (int j = 0; j < kNumKeys; j++) { + ASSERT_OK(Put(Key(j), "value" + std::to_string(j))); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->WaitForCompact(true)); + ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel()); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + + // change to all cold + { + MutexLock l(&mutex); + hot_start = Key(100); + hot_end = Key(200); + } + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel()); + ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + + // change to all hot, but level compaction only supports moving cold data to + // hot within its higher-level input range. 
+ { + MutexLock l(&mutex); + hot_start = Key(0); + hot_end = Key(100); + } + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel()); + ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + + // with mixed hot/cold data + { + MutexLock l(&mutex); + hot_start = Key(50); + hot_end = Key(100); + } + ASSERT_OK(Put(Key(0), "value" + std::to_string(0))); + ASSERT_OK(Put(Key(100), "value" + std::to_string(100))); + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel()); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + + // delete all hot data, but with snapshot to keep the range del + auto snap = db_->GetSnapshot(); + std::string start = Key(50); + std::string end = Key(100); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end)); + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel()); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + + // no range del is dropped because of snapshot + ASSERT_EQ( + options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE), + 0); + + // release the snapshot and do compaction again should remove all hot data + db_->ReleaseSnapshot(snap); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel()); + ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + + ASSERT_EQ( + options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE), + 1); +} + +INSTANTIATE_TEST_CASE_P(TieredCompactionTest, TieredCompactionTest, + testing::Bool()); + +class PrecludeLastLevelTest : public DBTestBase { + public: + PrecludeLastLevelTest() + : DBTestBase("preclude_last_level_test", /*env_do_fsync=*/false) { + mock_clock_ = std::make_shared<MockSystemClock>(env_->GetSystemClock()); + mock_env_ = std::make_unique<CompositeEnvWrapper>(env_, mock_clock_); + } + + protected: + std::unique_ptr<Env> mock_env_; + std::shared_ptr<MockSystemClock> mock_clock_; + + void SetUp() override { + mock_clock_->InstallTimedWaitFixCallback(); + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::StartPeriodicTaskScheduler:Init", [&](void* arg) { + auto periodic_task_scheduler_ptr = + reinterpret_cast<PeriodicTaskScheduler*>(arg); + periodic_task_scheduler_ptr->TEST_OverrideTimer(mock_clock_.get()); + }); + mock_clock_->SetCurrentTime(0); + } +}; + +TEST_F(PrecludeLastLevelTest, MigrationFromPreserveTimeManualCompaction) { + const int kNumTrigger = 4; + const int kNumLevels = 7; + const int kNumKeys = 100; + const int kKeyPerSec = 10; + + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.preserve_internal_time_seconds = 10000; + options.env = mock_env_.get(); + options.level0_file_num_compaction_trigger = kNumTrigger; + options.num_levels = kNumLevels; + DestroyAndReopen(options); + + // pass some time first, otherwise the first a few keys write time are going + // to be zero, and internally zero has special meaning: kUnknownSeqnoTime + dbfull()->TEST_WaitForPeridicTaskRun( + [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec)); }); + + int sst_num = 0; + // Write files that are overlap and enough to trigger compaction + for (; sst_num < 
kNumTrigger; sst_num++) { + for (int i = 0; i < kNumKeys; i++) { + ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value")); + dbfull()->TEST_WaitForPeridicTaskRun([&] { + mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec)); + }); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->WaitForCompact(true)); + + // all data is pushed to the last level + ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel()); + + // enable preclude feature + options.preclude_last_level_data_seconds = 10000; + options.last_level_temperature = Temperature::kCold; + Reopen(options); + + // all data is hot, even they're in the last level + ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + + // Generate a sstable and trigger manual compaction + ASSERT_OK(Put(Key(10), "value")); + ASSERT_OK(Flush()); + + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + + // all data is moved up to the penultimate level + ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel()); + ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + + // close explicitly, because the env is local variable which will be released + // first. + Close(); +} + +TEST_F(PrecludeLastLevelTest, MigrationFromPreserveTimeAutoCompaction) { + const int kNumTrigger = 4; + const int kNumLevels = 7; + const int kNumKeys = 100; + const int kKeyPerSec = 10; + + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.preserve_internal_time_seconds = 10000; + options.env = mock_env_.get(); + options.level0_file_num_compaction_trigger = kNumTrigger; + options.num_levels = kNumLevels; + DestroyAndReopen(options); + + // pass some time first, otherwise the first a few keys write time are going + // to be zero, and internally zero has special meaning: kUnknownSeqnoTime + dbfull()->TEST_WaitForPeridicTaskRun( + [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec)); }); + + int sst_num = 0; + // Write files that are overlap and enough to trigger compaction + for (; sst_num < kNumTrigger; sst_num++) { + for (int i = 0; i < kNumKeys; i++) { + ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value")); + dbfull()->TEST_WaitForPeridicTaskRun([&] { + mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec)); + }); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->WaitForCompact(true)); + + // all data is pushed to the last level + ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel()); + + // enable preclude feature + options.preclude_last_level_data_seconds = 10000; + options.last_level_temperature = Temperature::kCold; + // make sure it won't trigger Size Amp compaction, unlike normal Size Amp + // compaction which is typically a last level compaction, when tiered Storage + // ("preclude_last_level") is enabled, size amp won't include the last level. + // As the last level would be in cold tier and the size would not be a + // problem, which also avoid frequent hot to cold storage compaction. 
+ options.compaction_options_universal.max_size_amplification_percent = 400; + Reopen(options); + + // all data is hot, even they're in the last level + ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + + // Write more data, but still all hot until the 10th SST, as: + // write a key every 10 seconds, 100 keys per SST, each SST takes 1000 seconds + // The preclude_last_level_data_seconds is 10k + Random rnd(301); + for (; sst_num < kNumTrigger * 2 - 1; sst_num++) { + for (int i = 0; i < kNumKeys; i++) { + // the value needs to be big enough to trigger full compaction + ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), rnd.RandomString(100))); + dbfull()->TEST_WaitForPeridicTaskRun([&] { + mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec)); + }); + } + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->WaitForCompact(true)); + } + + // all data is moved up to the penultimate level + ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel()); + ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + + // close explicitly, because the env is local variable which will be released + // first. + Close(); +} + +TEST_F(PrecludeLastLevelTest, MigrationFromPreserveTimePartial) { + const int kNumTrigger = 4; + const int kNumLevels = 7; + const int kNumKeys = 100; + const int kKeyPerSec = 10; + + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.preserve_internal_time_seconds = 2000; + options.env = mock_env_.get(); + options.level0_file_num_compaction_trigger = kNumTrigger; + options.num_levels = kNumLevels; + DestroyAndReopen(options); + + // pass some time first, otherwise the first a few keys write time are going + // to be zero, and internally zero has special meaning: kUnknownSeqnoTime + dbfull()->TEST_WaitForPeridicTaskRun( + [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec)); }); + + int sst_num = 0; + // Write files that are overlap and enough to trigger compaction + for (; sst_num < kNumTrigger; sst_num++) { + for (int i = 0; i < kNumKeys; i++) { + ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value")); + dbfull()->TEST_WaitForPeridicTaskRun([&] { + mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec)); + }); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->WaitForCompact(true)); + + // all data is pushed to the last level + ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel()); + + std::vector<KeyVersion> key_versions; + ASSERT_OK(GetAllKeyVersions(db_, Slice(), Slice(), + std::numeric_limits<size_t>::max(), + &key_versions)); + + // make sure there're more than 300 keys and first 100 keys are having seqno + // zeroed out, the last 100 key seqno not zeroed out + ASSERT_GT(key_versions.size(), 300); + for (int i = 0; i < 100; i++) { + ASSERT_EQ(key_versions[i].sequence, 0); + } + auto rit = key_versions.rbegin(); + for (int i = 0; i < 100; i++) { + ASSERT_GT(rit->sequence, 0); + rit++; + } + + // enable preclude feature + options.preclude_last_level_data_seconds = 2000; + options.last_level_temperature = Temperature::kCold; + Reopen(options); + + // Generate a sstable and trigger manual compaction + ASSERT_OK(Put(Key(10), "value")); + ASSERT_OK(Flush()); + + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + + // some data are moved up, some are not + ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel()); + 
ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + + Close(); +} + +TEST_F(PrecludeLastLevelTest, SmallPrecludeTime) { + const int kNumTrigger = 4; + const int kNumLevels = 7; + const int kNumKeys = 100; + + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.preclude_last_level_data_seconds = 60; + options.preserve_internal_time_seconds = 0; + options.env = mock_env_.get(); + options.level0_file_num_compaction_trigger = kNumTrigger; + options.num_levels = kNumLevels; + options.last_level_temperature = Temperature::kCold; + DestroyAndReopen(options); + + Random rnd(301); + + dbfull()->TEST_WaitForPeridicTaskRun([&] { + mock_clock_->MockSleepForSeconds(static_cast<int>(rnd.Uniform(10) + 1)); + }); + + for (int i = 0; i < kNumKeys; i++) { + ASSERT_OK(Put(Key(i), rnd.RandomString(100))); + dbfull()->TEST_WaitForPeridicTaskRun([&] { + mock_clock_->MockSleepForSeconds(static_cast<int>(rnd.Uniform(2))); + }); + } + ASSERT_OK(Flush()); + + TablePropertiesCollection tables_props; + ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props)); + ASSERT_EQ(tables_props.size(), 1); + ASSERT_FALSE(tables_props.begin()->second->seqno_to_time_mapping.empty()); + SeqnoToTimeMapping tp_mapping; + ASSERT_OK( + tp_mapping.Add(tables_props.begin()->second->seqno_to_time_mapping)); + ASSERT_OK(tp_mapping.Sort()); + ASSERT_FALSE(tp_mapping.Empty()); + auto seqs = tp_mapping.TEST_GetInternalMapping(); + ASSERT_FALSE(seqs.empty()); + + // Wait more than preclude_last_level time, then make sure all the data is + // compacted to the last level even there's no write (no seqno -> time + // information was flushed to any SST). + mock_clock_->MockSleepForSeconds(100); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel()); + ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + + Close(); +} + +TEST_F(PrecludeLastLevelTest, LastLevelOnlyCompactionPartial) { + const int kNumTrigger = 4; + const int kNumLevels = 7; + const int kNumKeys = 100; + const int kKeyPerSec = 10; + + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.preserve_internal_time_seconds = 2000; + options.env = mock_env_.get(); + options.level0_file_num_compaction_trigger = kNumTrigger; + options.num_levels = kNumLevels; + DestroyAndReopen(options); + + // pass some time first, otherwise the first a few keys write time are going + // to be zero, and internally zero has special meaning: kUnknownSeqnoTime + dbfull()->TEST_WaitForPeridicTaskRun( + [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec)); }); + + int sst_num = 0; + // Write files that are overlap and enough to trigger compaction + for (; sst_num < kNumTrigger; sst_num++) { + for (int i = 0; i < kNumKeys; i++) { + ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value")); + dbfull()->TEST_WaitForPeridicTaskRun([&] { + mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec)); + }); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->WaitForCompact(true)); + + // all data is pushed to the last level + ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel()); + + // enable preclude feature + options.preclude_last_level_data_seconds = 2000; + options.last_level_temperature = Temperature::kCold; + Reopen(options); + + CompactRangeOptions cro; + cro.bottommost_level_compaction = 
BottommostLevelCompaction::kForce; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + + // some data are moved up, some are not + ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel()); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + + std::vector<KeyVersion> key_versions; + ASSERT_OK(GetAllKeyVersions(db_, Slice(), Slice(), + std::numeric_limits<size_t>::max(), + &key_versions)); + + // make sure there're more than 300 keys and first 100 keys are having seqno + // zeroed out, the last 100 key seqno not zeroed out + ASSERT_GT(key_versions.size(), 300); + for (int i = 0; i < 100; i++) { + ASSERT_EQ(key_versions[i].sequence, 0); + } + auto rit = key_versions.rbegin(); + for (int i = 0; i < 100; i++) { + ASSERT_GT(rit->sequence, 0); + rit++; + } + + Close(); +} + +class PrecludeLastLevelTestWithParms + : public PrecludeLastLevelTest, + public testing::WithParamInterface<bool> { + public: + PrecludeLastLevelTestWithParms() : PrecludeLastLevelTest() {} +}; + +TEST_P(PrecludeLastLevelTestWithParms, LastLevelOnlyCompactionNoPreclude) { + const int kNumTrigger = 4; + const int kNumLevels = 7; + const int kNumKeys = 100; + const int kKeyPerSec = 10; + + bool enable_preclude_last_level = GetParam(); + + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.preserve_internal_time_seconds = 2000; + options.env = mock_env_.get(); + options.level0_file_num_compaction_trigger = kNumTrigger; + options.num_levels = kNumLevels; + DestroyAndReopen(options); + + // pass some time first, otherwise the first a few keys write time are going + // to be zero, and internally zero has special meaning: kUnknownSeqnoTime + dbfull()->TEST_WaitForPeridicTaskRun( + [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec)); }); + + Random rnd(301); + int sst_num = 0; + // Write files that are overlap and enough to trigger compaction + for (; sst_num < kNumTrigger; sst_num++) { + for (int i = 0; i < kNumKeys; i++) { + ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), rnd.RandomString(100))); + dbfull()->TEST_WaitForPeridicTaskRun([&] { + mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec)); + }); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->WaitForCompact(true)); + + // all data is pushed to the last level + ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel()); + + std::atomic_bool is_manual_compaction_running = false; + std::atomic_bool verified_compaction_order = false; + + // Make sure the manual compaction is in progress and try to trigger a + // SizeRatio compaction by flushing 4 files to L0. The compaction will try to + // compact 4 files at L0 to L5 (the last empty level). + // If the preclude_last_feature is enabled, the auto triggered compaction + // cannot be picked. Otherwise, the auto triggered compaction can run in + // parallel with the last level compaction. + // L0: [a] [b] [c] [d] + // L5: (locked if preclude_last_level is enabled) + // L6: [z] (locked: manual compaction in progress) + // TODO: in this case, L0 files should just be compacted to L4, so the 2 + // compactions won't be overlapped. 
+ SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::ProcessKeyValueCompaction()::Processing", [&](void* arg) { + auto compaction = static_cast<Compaction*>(arg); + if (compaction->is_manual_compaction()) { + is_manual_compaction_running = true; + TEST_SYNC_POINT( + "PrecludeLastLevelTest::LastLevelOnlyCompactionConflit:" + "ManualCompaction1"); + TEST_SYNC_POINT( + "PrecludeLastLevelTest::LastLevelOnlyCompactionConflit:" + "ManualCompaction2"); + is_manual_compaction_running = false; + } + }); + + SyncPoint::GetInstance()->SetCallBack( + "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) { + auto compaction = static_cast<Compaction*>(arg); + if (enable_preclude_last_level && is_manual_compaction_running) { + ASSERT_TRUE(compaction == nullptr); + verified_compaction_order = true; + } else { + ASSERT_TRUE(compaction != nullptr); + verified_compaction_order = true; + } + if (!compaction || !compaction->is_manual_compaction()) { + TEST_SYNC_POINT( + "PrecludeLastLevelTest::LastLevelOnlyCompactionConflit:" + "AutoCompactionPicked"); + } + }); + + SyncPoint::GetInstance()->LoadDependency({ + {"PrecludeLastLevelTest::LastLevelOnlyCompactionConflit:" + "ManualCompaction1", + "PrecludeLastLevelTest::LastLevelOnlyCompactionConflit:StartWrite"}, + {"PrecludeLastLevelTest::LastLevelOnlyCompactionConflit:" + "AutoCompactionPicked", + "PrecludeLastLevelTest::LastLevelOnlyCompactionConflit:" + "ManualCompaction2"}, + }); + + SyncPoint::GetInstance()->EnableProcessing(); + + // only enable if the Parameter is true + if (enable_preclude_last_level) { + options.preclude_last_level_data_seconds = 2000; + } + options.max_background_jobs = 8; + options.last_level_temperature = Temperature::kCold; + Reopen(options); + + auto manual_compaction_thread = port::Thread([this]() { + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + cro.exclusive_manual_compaction = false; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + }); + + TEST_SYNC_POINT( + "PrecludeLastLevelTest::LastLevelOnlyCompactionConflit:StartWrite"); + auto stop_token = + dbfull()->TEST_write_controler().GetCompactionPressureToken(); + + for (; sst_num < kNumTrigger * 2; sst_num++) { + for (int i = 0; i < kNumKeys; i++) { + // the value needs to be big enough to trigger full compaction + ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value")); + dbfull()->TEST_WaitForPeridicTaskRun([&] { + mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec)); + }); + } + ASSERT_OK(Flush()); + } + + manual_compaction_thread.join(); + + ASSERT_OK(dbfull()->WaitForCompact(true)); + + if (enable_preclude_last_level) { + ASSERT_NE("0,0,0,0,0,1,1", FilesPerLevel()); + } else { + ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel()); + } + ASSERT_TRUE(verified_compaction_order); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + stop_token.reset(); + + Close(); +} + +INSTANTIATE_TEST_CASE_P(PrecludeLastLevelTestWithParms, + PrecludeLastLevelTestWithParms, testing::Bool()); + +// partition the SST into 3 ranges [0, 19] [20, 39] [40, ...] 
+class ThreeRangesPartitioner : public SstPartitioner { + public: + const char* Name() const override { return "SingleKeySstPartitioner"; } + + PartitionerResult ShouldPartition( + const PartitionerRequest& request) override { + if ((cmp->CompareWithoutTimestamp(*request.current_user_key, + DBTestBase::Key(20)) >= 0 && + cmp->CompareWithoutTimestamp(*request.prev_user_key, + DBTestBase::Key(20)) < 0) || + (cmp->CompareWithoutTimestamp(*request.current_user_key, + DBTestBase::Key(40)) >= 0 && + cmp->CompareWithoutTimestamp(*request.prev_user_key, + DBTestBase::Key(40)) < 0)) { + return kRequired; + } else { + return kNotRequired; + } + } + + bool CanDoTrivialMove(const Slice& /*smallest_user_key*/, + const Slice& /*largest_user_key*/) override { + return false; + } + + const Comparator* cmp = BytewiseComparator(); +}; + +class ThreeRangesPartitionerFactory : public SstPartitionerFactory { + public: + static const char* kClassName() { + return "TombstoneTestSstPartitionerFactory"; + } + const char* Name() const override { return kClassName(); } + + std::unique_ptr<SstPartitioner> CreatePartitioner( + const SstPartitioner::Context& /* context */) const override { + return std::unique_ptr<SstPartitioner>(new ThreeRangesPartitioner()); + } +}; + +TEST_F(PrecludeLastLevelTest, PartialPenultimateLevelCompaction) { + const int kNumTrigger = 4; + const int kNumLevels = 7; + const int kKeyPerSec = 10; + + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.env = mock_env_.get(); + options.level0_file_num_compaction_trigger = kNumTrigger; + options.preserve_internal_time_seconds = 10000; + options.num_levels = kNumLevels; + DestroyAndReopen(options); + + // pass some time first, otherwise the first a few keys write time are going + // to be zero, and internally zero has special meaning: kUnknownSeqnoTime + dbfull()->TEST_WaitForPeridicTaskRun( + [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); }); + + Random rnd(301); + + for (int i = 0; i < 300; i++) { + ASSERT_OK(Put(Key(i), rnd.RandomString(100))); + dbfull()->TEST_WaitForPeridicTaskRun( + [&] { mock_clock_->MockSleepForSeconds(kKeyPerSec); }); + } + ASSERT_OK(Flush()); + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + + // make sure all data is compacted to the last level + ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel()); + + // Create 3 L5 files + auto factory = std::make_shared<ThreeRangesPartitionerFactory>(); + options.sst_partitioner_factory = factory; + + Reopen(options); + + for (int i = 0; i < kNumTrigger - 1; i++) { + for (int j = 0; j < 100; j++) { + ASSERT_OK(Put(Key(i * 100 + j), rnd.RandomString(10))); + } + ASSERT_OK(Flush()); + } + + ASSERT_OK(dbfull()->WaitForCompact(true)); + + // L5: [0,19] [20,39] [40,299] + // L6: [0, 299] + ASSERT_EQ("0,0,0,0,0,3,1", FilesPerLevel()); + + // enable tiered storage feature + options.preclude_last_level_data_seconds = 10000; + options.last_level_temperature = Temperature::kCold; + options.statistics = CreateDBStatistics(); + Reopen(options); + + ColumnFamilyMetaData meta; + db_->GetColumnFamilyMetaData(&meta); + ASSERT_EQ(meta.levels[5].files.size(), 3); + ASSERT_EQ(meta.levels[6].files.size(), 1); + ASSERT_EQ(meta.levels[6].files[0].smallestkey, Key(0)); + ASSERT_EQ(meta.levels[6].files[0].largestkey, Key(299)); + + std::string file_path = meta.levels[5].files[1].db_path; + std::vector<std::string> files; + // pick 3rd file @L5 + 
file@L6 for compaction + files.push_back(file_path + "/" + meta.levels[5].files[2].name); + files.push_back(file_path + "/" + meta.levels[6].files[0].name); + ASSERT_OK(db_->CompactFiles(CompactionOptions(), files, 6)); + + // The compaction only moved partial of the hot data to hot tier, range[0,39] + // is unsafe to move up, otherwise, they will be overlapped with the existing + // files@L5. + // The output should be: + // L5: [0,19] [20,39] [40,299] <-- Temperature::kUnknown + // L6: [0,19] [20,39] <-- Temperature::kCold + // L6 file is split because of the customized partitioner + ASSERT_EQ("0,0,0,0,0,3,2", FilesPerLevel()); + + // even all the data is hot, but not all data are moved to the hot tier + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + + db_->GetColumnFamilyMetaData(&meta); + ASSERT_EQ(meta.levels[5].files.size(), 3); + ASSERT_EQ(meta.levels[6].files.size(), 2); + for (const auto& file : meta.levels[5].files) { + ASSERT_EQ(file.temperature, Temperature::kUnknown); + } + for (const auto& file : meta.levels[6].files) { + ASSERT_EQ(file.temperature, Temperature::kCold); + } + ASSERT_EQ(meta.levels[6].files[0].smallestkey, Key(0)); + ASSERT_EQ(meta.levels[6].files[0].largestkey, Key(19)); + ASSERT_EQ(meta.levels[6].files[1].smallestkey, Key(20)); + ASSERT_EQ(meta.levels[6].files[1].largestkey, Key(39)); + + Close(); +} + +struct TestPropertiesCollector : public TablePropertiesCollector { + Status AddUserKey(const Slice& key, const Slice& /*value*/, + EntryType /*type*/, SequenceNumber /*seq*/, + uint64_t /*file_size*/) override { + if (cmp->Compare(key, DBTestBase::Key(100)) == 0) { + has_key_100 = true; + } + if (cmp->Compare(key, DBTestBase::Key(200)) == 0) { + has_key_200 = true; + } + + return Status::OK(); + } + + const char* Name() const override { return "TestTablePropertiesCollector"; } + + UserCollectedProperties GetReadableProperties() const override { + UserCollectedProperties ret; + return ret; + } + + Status Finish(UserCollectedProperties* /*properties*/) override { + // The LSM tree would be like: + // L5: [0,19] [20,39] [40,299] + // L6: [0, 299] + // the 3rd file @L5 has both 100 and 200, which will be marked for + // compaction + // Also avoid marking flushed SST for compaction, which won't have both 100 + // and 200 + if (has_key_100 && has_key_200) { + need_compact_ = true; + } else { + need_compact_ = false; + } + has_key_100 = false; + has_key_200 = false; + return Status::OK(); + } + + bool NeedCompact() const override { return need_compact_; } + + const Comparator* cmp = BytewiseComparator(); + + private: + bool has_key_100 = false; + bool has_key_200 = false; + + bool need_compact_ = false; +}; + +class TestPropertiesCollectorFactory : public TablePropertiesCollectorFactory { + public: + TablePropertiesCollector* CreateTablePropertiesCollector( + TablePropertiesCollectorFactory::Context /*context*/) override { + return new TestPropertiesCollector; + } + const char* Name() const override { return "TestTablePropertiesCollector"; } +}; + +TEST_F(PrecludeLastLevelTest, PartialPenultimateLevelCompactionWithRangeDel) { + const int kNumTrigger = 4; + const int kNumLevels = 7; + const int kKeyPerSec = 10; + + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.env = mock_env_.get(); + options.level0_file_num_compaction_trigger = kNumTrigger; + options.preserve_internal_time_seconds = 10000; + options.num_levels = kNumLevels; + // set a small 
max_compaction_bytes to avoid input level expansion + options.max_compaction_bytes = 30000; + options.ignore_max_compaction_bytes_for_input = false; + DestroyAndReopen(options); + + // pass some time first, otherwise the first a few keys write time are going + // to be zero, and internally zero has special meaning: kUnknownSeqnoTime + dbfull()->TEST_WaitForPeridicTaskRun( + [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); }); + + Random rnd(301); + + for (int i = 0; i < 300; i++) { + ASSERT_OK(Put(Key(i), rnd.RandomString(100))); + dbfull()->TEST_WaitForPeridicTaskRun( + [&] { mock_clock_->MockSleepForSeconds(kKeyPerSec); }); + } + ASSERT_OK(Flush()); + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + + // make sure all data is compacted to the last level + ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel()); + + // Create 3 L5 files + auto factory = std::make_shared<ThreeRangesPartitionerFactory>(); + options.sst_partitioner_factory = factory; + + // the user defined properties_collector will mark the 3rd file for compaction + auto collector_factory = std::make_shared<TestPropertiesCollectorFactory>(); + options.table_properties_collector_factories.resize(1); + options.table_properties_collector_factories[0] = collector_factory; + // enable tiered storage feature + options.preclude_last_level_data_seconds = 10000; + options.last_level_temperature = Temperature::kCold; + Reopen(options); + + for (int i = 0; i < kNumTrigger - 2; i++) { + for (int j = 0; j < 100; j++) { + ASSERT_OK(Put(Key(i * 100 + j), rnd.RandomString(10))); + } + ASSERT_OK(Flush()); + } + + // make sure there is one and only one compaction supports per-key placement + // but has the penultimate level output disabled. + std::atomic_int per_key_comp_num = 0; + SyncPoint::GetInstance()->SetCallBack( + "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) { + auto compaction = static_cast<Compaction*>(arg); + if (compaction->SupportsPerKeyPlacement()) { + ASSERT_EQ(compaction->GetPenultimateOutputRangeType(), + Compaction::PenultimateOutputRangeType::kDisabled); + per_key_comp_num++; + } + }); + + SyncPoint::GetInstance()->EnableProcessing(); + + for (int j = 0; j < 100; j++) { + ASSERT_OK(Put(Key(200 + j), rnd.RandomString(10))); + } + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(32), Key(40))); + ASSERT_OK(Flush()); + + // Before the per-key placement compaction, the LSM tress should be like: + // L5: [0,19] [20,40] [40,299] + // L6: [0, 299] + // The 2nd file @L5 has the largest key 40 because of range del + + ASSERT_OK(dbfull()->WaitForCompact(true)); + + ASSERT_EQ(per_key_comp_num, 1); + + // the compaction won't move any data to the penultimate level + ASSERT_EQ("0,0,0,0,0,2,3", FilesPerLevel()); + + Close(); +} + +#endif // !defined(ROCKSDB_LITE) + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { +#if !defined(ROCKSDB_LITE) + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +#else + (void)argc; + (void)argv; + return 0; +#endif +} |