Diffstat (limited to 'src/rocksdb/db/compaction')
-rw-r--r--  src/rocksdb/db/compaction/clipping_iterator.h | 276
-rw-r--r--  src/rocksdb/db/compaction/clipping_iterator_test.cc | 259
-rw-r--r--  src/rocksdb/db/compaction/compaction.cc | 855
-rw-r--r--  src/rocksdb/db/compaction/compaction.h | 559
-rw-r--r--  src/rocksdb/db/compaction/compaction_iteration_stats.h | 49
-rw-r--r--  src/rocksdb/db/compaction/compaction_iterator.cc | 1338
-rw-r--r--  src/rocksdb/db/compaction/compaction_iterator.h | 513
-rw-r--r--  src/rocksdb/db/compaction/compaction_iterator_test.cc | 1618
-rw-r--r--  src/rocksdb/db/compaction/compaction_job.cc | 2060
-rw-r--r--  src/rocksdb/db/compaction/compaction_job.h | 500
-rw-r--r--  src/rocksdb/db/compaction/compaction_job_stats_test.cc | 975
-rw-r--r--  src/rocksdb/db/compaction/compaction_job_test.cc | 2451
-rw-r--r--  src/rocksdb/db/compaction/compaction_outputs.cc | 646
-rw-r--r--  src/rocksdb/db/compaction/compaction_outputs.h | 385
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker.cc | 1234
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker.h | 323
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker_fifo.cc | 433
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker_fifo.h | 63
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker_level.cc | 841
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker_level.h | 33
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker_test.cc | 3964
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker_universal.cc | 1450
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker_universal.h | 32
-rw-r--r--  src/rocksdb/db/compaction/compaction_service_job.cc | 829
-rw-r--r--  src/rocksdb/db/compaction/compaction_service_test.cc | 966
-rw-r--r--  src/rocksdb/db/compaction/compaction_state.cc | 46
-rw-r--r--  src/rocksdb/db/compaction/compaction_state.h | 42
-rw-r--r--  src/rocksdb/db/compaction/file_pri.h | 92
-rw-r--r--  src/rocksdb/db/compaction/sst_partitioner.cc | 90
-rw-r--r--  src/rocksdb/db/compaction/subcompaction_state.cc | 106
-rw-r--r--  src/rocksdb/db/compaction/subcompaction_state.h | 214
-rw-r--r--  src/rocksdb/db/compaction/tiered_compaction_test.cc | 2028
32 files changed, 25270 insertions, 0 deletions
diff --git a/src/rocksdb/db/compaction/clipping_iterator.h b/src/rocksdb/db/compaction/clipping_iterator.h
new file mode 100644
index 000000000..1ed465c2c
--- /dev/null
+++ b/src/rocksdb/db/compaction/clipping_iterator.h
@@ -0,0 +1,276 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cassert>
+
+#include "rocksdb/comparator.h"
+#include "table/internal_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// An internal iterator that wraps another one and ensures that any keys
+// returned are strictly within a range [start, end). If the underlying
+// iterator has already performed the bounds checking, it relies on that result;
+// otherwise, it performs the necessary key comparisons itself. Both bounds
+// are optional.
+class ClippingIterator : public InternalIterator {
+ public:
+ ClippingIterator(InternalIterator* iter, const Slice* start, const Slice* end,
+ const CompareInterface* cmp)
+ : iter_(iter), start_(start), end_(end), cmp_(cmp), valid_(false) {
+ assert(iter_);
+ assert(cmp_);
+ assert(!start_ || !end_ || cmp_->Compare(*start_, *end_) <= 0);
+
+ UpdateAndEnforceBounds();
+ }
+
+ bool Valid() const override { return valid_; }
+
+ void SeekToFirst() override {
+ if (start_) {
+ iter_->Seek(*start_);
+ } else {
+ iter_->SeekToFirst();
+ }
+
+ UpdateAndEnforceUpperBound();
+ }
+
+ void SeekToLast() override {
+ if (end_) {
+ iter_->SeekForPrev(*end_);
+
+ // Upper bound is exclusive, so we need a key which is strictly smaller
+ if (iter_->Valid() && cmp_->Compare(iter_->key(), *end_) == 0) {
+ iter_->Prev();
+ }
+ } else {
+ iter_->SeekToLast();
+ }
+
+ UpdateAndEnforceLowerBound();
+ }
+
+ void Seek(const Slice& target) override {
+ if (start_ && cmp_->Compare(target, *start_) < 0) {
+ iter_->Seek(*start_);
+ UpdateAndEnforceUpperBound();
+ return;
+ }
+
+ if (end_ && cmp_->Compare(target, *end_) >= 0) {
+ valid_ = false;
+ return;
+ }
+
+ iter_->Seek(target);
+ UpdateAndEnforceUpperBound();
+ }
+
+ void SeekForPrev(const Slice& target) override {
+ if (start_ && cmp_->Compare(target, *start_) < 0) {
+ valid_ = false;
+ return;
+ }
+
+ if (end_ && cmp_->Compare(target, *end_) >= 0) {
+ iter_->SeekForPrev(*end_);
+
+ // Upper bound is exclusive, so we need a key which is strictly smaller
+ if (iter_->Valid() && cmp_->Compare(iter_->key(), *end_) == 0) {
+ iter_->Prev();
+ }
+
+ UpdateAndEnforceLowerBound();
+ return;
+ }
+
+ iter_->SeekForPrev(target);
+ UpdateAndEnforceLowerBound();
+ }
+
+ void Next() override {
+ assert(valid_);
+ iter_->Next();
+ UpdateAndEnforceUpperBound();
+ }
+
+ bool NextAndGetResult(IterateResult* result) override {
+ assert(valid_);
+ assert(result);
+
+ IterateResult res;
+ valid_ = iter_->NextAndGetResult(&res);
+
+ if (!valid_) {
+ return false;
+ }
+
+ if (end_) {
+ EnforceUpperBoundImpl(res.bound_check_result);
+
+ if (!valid_) {
+ return false;
+ }
+ }
+
+ res.bound_check_result = IterBoundCheck::kInbound;
+ *result = res;
+
+ return true;
+ }
+
+ void Prev() override {
+ assert(valid_);
+ iter_->Prev();
+ UpdateAndEnforceLowerBound();
+ }
+
+ Slice key() const override {
+ assert(valid_);
+ return iter_->key();
+ }
+
+ Slice user_key() const override {
+ assert(valid_);
+ return iter_->user_key();
+ }
+
+ Slice value() const override {
+ assert(valid_);
+ return iter_->value();
+ }
+
+ Status status() const override { return iter_->status(); }
+
+ bool PrepareValue() override {
+ assert(valid_);
+
+ if (iter_->PrepareValue()) {
+ return true;
+ }
+
+ assert(!iter_->Valid());
+ valid_ = false;
+ return false;
+ }
+
+ bool MayBeOutOfLowerBound() override {
+ assert(valid_);
+ return false;
+ }
+
+ IterBoundCheck UpperBoundCheckResult() override {
+ assert(valid_);
+ return IterBoundCheck::kInbound;
+ }
+
+ void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
+ iter_->SetPinnedItersMgr(pinned_iters_mgr);
+ }
+
+ bool IsKeyPinned() const override {
+ assert(valid_);
+ return iter_->IsKeyPinned();
+ }
+
+ bool IsValuePinned() const override {
+ assert(valid_);
+ return iter_->IsValuePinned();
+ }
+
+ Status GetProperty(std::string prop_name, std::string* prop) override {
+ return iter_->GetProperty(prop_name, prop);
+ }
+
+ private:
+ void UpdateValid() {
+ assert(!iter_->Valid() || iter_->status().ok());
+
+ valid_ = iter_->Valid();
+ }
+
+ void EnforceUpperBoundImpl(IterBoundCheck bound_check_result) {
+ if (bound_check_result == IterBoundCheck::kInbound) {
+ return;
+ }
+
+ if (bound_check_result == IterBoundCheck::kOutOfBound) {
+ valid_ = false;
+ return;
+ }
+
+ assert(bound_check_result == IterBoundCheck::kUnknown);
+
+ if (cmp_->Compare(key(), *end_) >= 0) {
+ valid_ = false;
+ }
+ }
+
+ void EnforceUpperBound() {
+ if (!valid_) {
+ return;
+ }
+
+ if (!end_) {
+ return;
+ }
+
+ EnforceUpperBoundImpl(iter_->UpperBoundCheckResult());
+ }
+
+ void EnforceLowerBound() {
+ if (!valid_) {
+ return;
+ }
+
+ if (!start_) {
+ return;
+ }
+
+ if (!iter_->MayBeOutOfLowerBound()) {
+ return;
+ }
+
+ if (cmp_->Compare(key(), *start_) < 0) {
+ valid_ = false;
+ }
+ }
+
+ void AssertBounds() {
+ assert(!valid_ || !start_ || cmp_->Compare(key(), *start_) >= 0);
+ assert(!valid_ || !end_ || cmp_->Compare(key(), *end_) < 0);
+ }
+
+ void UpdateAndEnforceBounds() {
+ UpdateValid();
+ EnforceUpperBound();
+ EnforceLowerBound();
+ AssertBounds();
+ }
+
+ void UpdateAndEnforceUpperBound() {
+ UpdateValid();
+ EnforceUpperBound();
+ AssertBounds();
+ }
+
+ void UpdateAndEnforceLowerBound() {
+ UpdateValid();
+ EnforceLowerBound();
+ AssertBounds();
+ }
+
+ InternalIterator* iter_;
+ const Slice* start_;
+ const Slice* end_;
+ const CompareInterface* cmp_;
+ bool valid_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
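
To make the interface above concrete, here is a minimal usage sketch (an editorial illustration, not part of the diff) that mirrors the construction used in the test file that follows. It assumes the same internal headers used by that test (util/vector_iterator.h, rocksdb/comparator.h) are available; the function name ClipExample is hypothetical.

    #include <string>
    #include <vector>

    #include "db/compaction/clipping_iterator.h"
    #include "rocksdb/comparator.h"
    #include "util/vector_iterator.h"

    using ROCKSDB_NAMESPACE::BytewiseComparator;
    using ROCKSDB_NAMESPACE::ClippingIterator;
    using ROCKSDB_NAMESPACE::Slice;
    using ROCKSDB_NAMESPACE::VectorIterator;

    void ClipExample() {
      const std::vector<std::string> keys{"a", "b", "c", "d"};
      const std::vector<std::string> values{"1", "2", "3", "4"};
      VectorIterator input(keys, values, BytewiseComparator());

      // Clip to the half-open range ["b", "d"): only "b" and "c" are visible.
      const Slice start("b");
      const Slice end("d");
      ClippingIterator clip(&input, &start, &end, BytewiseComparator());

      for (clip.SeekToFirst(); clip.Valid(); clip.Next()) {
        // clip.key() always satisfies start <= key < end here.
      }
    }
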
diff --git a/src/rocksdb/db/compaction/clipping_iterator_test.cc b/src/rocksdb/db/compaction/clipping_iterator_test.cc
new file mode 100644
index 000000000..b2b167048
--- /dev/null
+++ b/src/rocksdb/db/compaction/clipping_iterator_test.cc
@@ -0,0 +1,259 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/compaction/clipping_iterator.h"
+
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "rocksdb/comparator.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/vector_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A vector iterator which does its own bounds checking. This is for testing the
+// optimizations in the clipping iterator where we bypass the bounds checking if
+// the input iterator has already performed it.
+class BoundsCheckingVectorIterator : public VectorIterator {
+ public:
+ BoundsCheckingVectorIterator(const std::vector<std::string>& keys,
+ const std::vector<std::string>& values,
+ const Slice* start, const Slice* end,
+ const Comparator* cmp)
+ : VectorIterator(keys, values, cmp), start_(start), end_(end), cmp_(cmp) {
+ assert(cmp_);
+ }
+
+ bool NextAndGetResult(IterateResult* result) override {
+ assert(Valid());
+ assert(result);
+
+ Next();
+
+ if (!Valid()) {
+ return false;
+ }
+
+ result->key = key();
+ result->bound_check_result = UpperBoundCheckResult();
+ result->value_prepared = true;
+
+ return true;
+ }
+
+ bool MayBeOutOfLowerBound() override {
+ assert(Valid());
+
+ if (!start_) {
+ return false;
+ }
+
+ return cmp_->Compare(key(), *start_) < 0;
+ }
+
+ IterBoundCheck UpperBoundCheckResult() override {
+ assert(Valid());
+
+ if (!end_) {
+ return IterBoundCheck::kInbound;
+ }
+
+ return cmp_->Compare(key(), *end_) >= 0 ? IterBoundCheck::kOutOfBound
+ : IterBoundCheck::kInbound;
+ }
+
+ private:
+ const Slice* start_;
+ const Slice* end_;
+ const Comparator* cmp_;
+};
+
+class ClippingIteratorTest
+ : public ::testing::Test,
+ public ::testing::WithParamInterface<std::tuple<bool, size_t, size_t>> {};
+
+TEST_P(ClippingIteratorTest, Clip) {
+ const std::vector<std::string> keys{"key0", "key1", "key2", "key3", "key4",
+ "key5", "key6", "key7", "key8", "key9"};
+ const std::vector<std::string> values{
+ "unused0", "value1", "value2", "value3", "unused4",
+ "unused5", "unused6", "unused7", "unused8", "unused9"};
+
+ assert(keys.size() == values.size());
+
+ // Note: the input always contains key1, key2, and key3; however, the clipping
+ // window is based on the test parameters: its left edge is a value in the
+ // range [0, 4], and its size is a value in the range [0, 5]
+ const std::vector<std::string> input_keys{keys[1], keys[2], keys[3]};
+ const std::vector<std::string> input_values{values[1], values[2], values[3]};
+
+ const bool use_bounds_checking_vec_it = std::get<0>(GetParam());
+
+ const size_t clip_start_idx = std::get<1>(GetParam());
+ const size_t clip_window_size = std::get<2>(GetParam());
+ const size_t clip_end_idx = clip_start_idx + clip_window_size;
+
+ const Slice start(keys[clip_start_idx]);
+ const Slice end(keys[clip_end_idx]);
+
+ std::unique_ptr<InternalIterator> input(
+ use_bounds_checking_vec_it
+ ? new BoundsCheckingVectorIterator(input_keys, input_values, &start,
+ &end, BytewiseComparator())
+ : new VectorIterator(input_keys, input_values, BytewiseComparator()));
+
+ ClippingIterator clip(input.get(), &start, &end, BytewiseComparator());
+
+ // The range the clipping iterator should return values from. This is
+ // essentially the intersection of the input range [1, 4) and the clipping
+ // window [clip_start_idx, clip_end_idx)
+ const size_t data_start_idx =
+ std::max(clip_start_idx, static_cast<size_t>(1));
+ const size_t data_end_idx = std::min(clip_end_idx, static_cast<size_t>(4));
+
+ // Range is empty; all Seeks should fail
+ if (data_start_idx >= data_end_idx) {
+ clip.SeekToFirst();
+ ASSERT_FALSE(clip.Valid());
+
+ clip.SeekToLast();
+ ASSERT_FALSE(clip.Valid());
+
+ for (size_t i = 0; i < keys.size(); ++i) {
+ clip.Seek(keys[i]);
+ ASSERT_FALSE(clip.Valid());
+
+ clip.SeekForPrev(keys[i]);
+ ASSERT_FALSE(clip.Valid());
+ }
+
+ return;
+ }
+
+ // Range is non-empty; call SeekToFirst and iterate forward
+ clip.SeekToFirst();
+ ASSERT_TRUE(clip.Valid());
+ ASSERT_EQ(clip.key(), keys[data_start_idx]);
+ ASSERT_EQ(clip.value(), values[data_start_idx]);
+ ASSERT_FALSE(clip.MayBeOutOfLowerBound());
+ ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound);
+
+ for (size_t i = data_start_idx + 1; i < data_end_idx; ++i) {
+ clip.Next();
+ ASSERT_TRUE(clip.Valid());
+ ASSERT_EQ(clip.key(), keys[i]);
+ ASSERT_EQ(clip.value(), values[i]);
+ ASSERT_FALSE(clip.MayBeOutOfLowerBound());
+ ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound);
+ }
+
+ clip.Next();
+ ASSERT_FALSE(clip.Valid());
+
+ // Do it again using NextAndGetResult
+ clip.SeekToFirst();
+ ASSERT_TRUE(clip.Valid());
+ ASSERT_EQ(clip.key(), keys[data_start_idx]);
+ ASSERT_EQ(clip.value(), values[data_start_idx]);
+ ASSERT_FALSE(clip.MayBeOutOfLowerBound());
+ ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound);
+
+ for (size_t i = data_start_idx + 1; i < data_end_idx; ++i) {
+ IterateResult result;
+ ASSERT_TRUE(clip.NextAndGetResult(&result));
+ ASSERT_EQ(result.key, keys[i]);
+ ASSERT_EQ(result.bound_check_result, IterBoundCheck::kInbound);
+ ASSERT_TRUE(clip.Valid());
+ ASSERT_EQ(clip.key(), keys[i]);
+ ASSERT_EQ(clip.value(), values[i]);
+ ASSERT_FALSE(clip.MayBeOutOfLowerBound());
+ ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound);
+ }
+
+ IterateResult result;
+ ASSERT_FALSE(clip.NextAndGetResult(&result));
+ ASSERT_FALSE(clip.Valid());
+
+ // Call SeekToLast and iterate backward
+ clip.SeekToLast();
+ ASSERT_TRUE(clip.Valid());
+ ASSERT_EQ(clip.key(), keys[data_end_idx - 1]);
+ ASSERT_EQ(clip.value(), values[data_end_idx - 1]);
+ ASSERT_FALSE(clip.MayBeOutOfLowerBound());
+ ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound);
+
+ for (size_t i = data_end_idx - 2; i >= data_start_idx; --i) {
+ clip.Prev();
+ ASSERT_TRUE(clip.Valid());
+ ASSERT_EQ(clip.key(), keys[i]);
+ ASSERT_EQ(clip.value(), values[i]);
+ ASSERT_FALSE(clip.MayBeOutOfLowerBound());
+ ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound);
+ }
+
+ clip.Prev();
+ ASSERT_FALSE(clip.Valid());
+
+ // Call Seek/SeekForPrev for all keys; Seek should return the smallest key
+ // which is >= the target; SeekForPrev should return the largest key which is
+ // <= the target
+ for (size_t i = 0; i < keys.size(); ++i) {
+ clip.Seek(keys[i]);
+
+ if (i < data_start_idx) {
+ ASSERT_TRUE(clip.Valid());
+ ASSERT_EQ(clip.key(), keys[data_start_idx]);
+ ASSERT_EQ(clip.value(), values[data_start_idx]);
+ ASSERT_FALSE(clip.MayBeOutOfLowerBound());
+ ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound);
+ } else if (i < data_end_idx) {
+ ASSERT_TRUE(clip.Valid());
+ ASSERT_EQ(clip.key(), keys[i]);
+ ASSERT_EQ(clip.value(), values[i]);
+ ASSERT_FALSE(clip.MayBeOutOfLowerBound());
+ ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound);
+ } else {
+ ASSERT_FALSE(clip.Valid());
+ }
+
+ clip.SeekForPrev(keys[i]);
+
+ if (i < data_start_idx) {
+ ASSERT_FALSE(clip.Valid());
+ } else if (i < data_end_idx) {
+ ASSERT_TRUE(clip.Valid());
+ ASSERT_EQ(clip.key(), keys[i]);
+ ASSERT_EQ(clip.value(), values[i]);
+ ASSERT_FALSE(clip.MayBeOutOfLowerBound());
+ ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound);
+ } else {
+ ASSERT_TRUE(clip.Valid());
+ ASSERT_EQ(clip.key(), keys[data_end_idx - 1]);
+ ASSERT_EQ(clip.value(), values[data_end_idx - 1]);
+ ASSERT_FALSE(clip.MayBeOutOfLowerBound());
+ ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound);
+ }
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(
+ ClippingIteratorTest, ClippingIteratorTest,
+ ::testing::Combine(
+ ::testing::Bool(),
+ ::testing::Range(static_cast<size_t>(0), static_cast<size_t>(5)),
+ ::testing::Range(static_cast<size_t>(0), static_cast<size_t>(6))));
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/compaction/compaction.cc b/src/rocksdb/db/compaction/compaction.cc
new file mode 100644
index 000000000..a32b529f7
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction.cc
@@ -0,0 +1,855 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction.h"
+
+#include <cinttypes>
+#include <vector>
+
+#include "db/column_family.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/sst_partitioner.h"
+#include "test_util/sync_point.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const uint64_t kRangeTombstoneSentinel =
+ PackSequenceAndType(kMaxSequenceNumber, kTypeRangeDeletion);
+
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
+ const InternalKey& b) {
+ auto c = user_cmp->CompareWithoutTimestamp(a.user_key(), b.user_key());
+ if (c != 0) {
+ return c;
+ }
+ auto a_footer = ExtractInternalKeyFooter(a.Encode());
+ auto b_footer = ExtractInternalKeyFooter(b.Encode());
+ if (a_footer == kRangeTombstoneSentinel) {
+ if (b_footer != kRangeTombstoneSentinel) {
+ return -1;
+ }
+ } else if (b_footer == kRangeTombstoneSentinel) {
+ return 1;
+ }
+ return 0;
+}
+
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey* a,
+ const InternalKey& b) {
+ if (a == nullptr) {
+ return -1;
+ }
+ return sstableKeyCompare(user_cmp, *a, b);
+}
+
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
+ const InternalKey* b) {
+ if (b == nullptr) {
+ return -1;
+ }
+ return sstableKeyCompare(user_cmp, a, *b);
+}
+
+uint64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
+ uint64_t sum = 0;
+ for (size_t i = 0; i < files.size() && files[i]; i++) {
+ sum += files[i]->fd.GetFileSize();
+ }
+ return sum;
+}
+
+void Compaction::SetInputVersion(Version* _input_version) {
+ input_version_ = _input_version;
+ cfd_ = input_version_->cfd();
+
+ cfd_->Ref();
+ input_version_->Ref();
+ edit_.SetColumnFamily(cfd_->GetID());
+}
+
+void Compaction::GetBoundaryKeys(
+ VersionStorageInfo* vstorage,
+ const std::vector<CompactionInputFiles>& inputs, Slice* smallest_user_key,
+ Slice* largest_user_key, int exclude_level) {
+ bool initialized = false;
+ const Comparator* ucmp = vstorage->InternalComparator()->user_comparator();
+ for (size_t i = 0; i < inputs.size(); ++i) {
+ if (inputs[i].files.empty() || inputs[i].level == exclude_level) {
+ continue;
+ }
+ if (inputs[i].level == 0) {
+ // we need to consider all files on level 0
+ for (const auto* f : inputs[i].files) {
+ const Slice& start_user_key = f->smallest.user_key();
+ if (!initialized ||
+ ucmp->Compare(start_user_key, *smallest_user_key) < 0) {
+ *smallest_user_key = start_user_key;
+ }
+ const Slice& end_user_key = f->largest.user_key();
+ if (!initialized ||
+ ucmp->Compare(end_user_key, *largest_user_key) > 0) {
+ *largest_user_key = end_user_key;
+ }
+ initialized = true;
+ }
+ } else {
+ // we only need to consider the first and last file
+ const Slice& start_user_key = inputs[i].files[0]->smallest.user_key();
+ if (!initialized ||
+ ucmp->Compare(start_user_key, *smallest_user_key) < 0) {
+ *smallest_user_key = start_user_key;
+ }
+ const Slice& end_user_key = inputs[i].files.back()->largest.user_key();
+ if (!initialized || ucmp->Compare(end_user_key, *largest_user_key) > 0) {
+ *largest_user_key = end_user_key;
+ }
+ initialized = true;
+ }
+ }
+}
+
+std::vector<CompactionInputFiles> Compaction::PopulateWithAtomicBoundaries(
+ VersionStorageInfo* vstorage, std::vector<CompactionInputFiles> inputs) {
+ const Comparator* ucmp = vstorage->InternalComparator()->user_comparator();
+ for (size_t i = 0; i < inputs.size(); i++) {
+ if (inputs[i].level == 0 || inputs[i].files.empty()) {
+ continue;
+ }
+ inputs[i].atomic_compaction_unit_boundaries.reserve(inputs[i].files.size());
+ AtomicCompactionUnitBoundary cur_boundary;
+ size_t first_atomic_idx = 0;
+ auto add_unit_boundary = [&](size_t to) {
+ if (first_atomic_idx == to) return;
+ for (size_t k = first_atomic_idx; k < to; k++) {
+ inputs[i].atomic_compaction_unit_boundaries.push_back(cur_boundary);
+ }
+ first_atomic_idx = to;
+ };
+ for (size_t j = 0; j < inputs[i].files.size(); j++) {
+ const auto* f = inputs[i].files[j];
+ if (j == 0) {
+ // First file in a level.
+ cur_boundary.smallest = &f->smallest;
+ cur_boundary.largest = &f->largest;
+ } else if (sstableKeyCompare(ucmp, *cur_boundary.largest, f->smallest) ==
+ 0) {
+ // SSTs overlap but the end key of the previous file was not
+ // artificially extended by a range tombstone. Extend the current
+ // boundary.
+ cur_boundary.largest = &f->largest;
+ } else {
+ // Atomic compaction unit has ended.
+ add_unit_boundary(j);
+ cur_boundary.smallest = &f->smallest;
+ cur_boundary.largest = &f->largest;
+ }
+ }
+ add_unit_boundary(inputs[i].files.size());
+ assert(inputs[i].files.size() ==
+ inputs[i].atomic_compaction_unit_boundaries.size());
+ }
+ return inputs;
+}
+
+// helper function to determine if compaction is creating files at the
+// bottommost level
+bool Compaction::IsBottommostLevel(
+ int output_level, VersionStorageInfo* vstorage,
+ const std::vector<CompactionInputFiles>& inputs) {
+ int output_l0_idx;
+ if (output_level == 0) {
+ output_l0_idx = 0;
+ for (const auto* file : vstorage->LevelFiles(0)) {
+ if (inputs[0].files.back() == file) {
+ break;
+ }
+ ++output_l0_idx;
+ }
+ assert(static_cast<size_t>(output_l0_idx) < vstorage->LevelFiles(0).size());
+ } else {
+ output_l0_idx = -1;
+ }
+ Slice smallest_key, largest_key;
+ GetBoundaryKeys(vstorage, inputs, &smallest_key, &largest_key);
+ return !vstorage->RangeMightExistAfterSortedRun(smallest_key, largest_key,
+ output_level, output_l0_idx);
+}
+
+// test function to validate the functionality of IsBottommostLevel()
+// function -- determines if compaction with inputs and storage is bottommost
+bool Compaction::TEST_IsBottommostLevel(
+ int output_level, VersionStorageInfo* vstorage,
+ const std::vector<CompactionInputFiles>& inputs) {
+ return IsBottommostLevel(output_level, vstorage, inputs);
+}
+
+bool Compaction::IsFullCompaction(
+ VersionStorageInfo* vstorage,
+ const std::vector<CompactionInputFiles>& inputs) {
+ size_t num_files_in_compaction = 0;
+ size_t total_num_files = 0;
+ for (int l = 0; l < vstorage->num_levels(); l++) {
+ total_num_files += vstorage->NumLevelFiles(l);
+ }
+ for (size_t i = 0; i < inputs.size(); i++) {
+ num_files_in_compaction += inputs[i].size();
+ }
+ return num_files_in_compaction == total_num_files;
+}
+
+Compaction::Compaction(
+ VersionStorageInfo* vstorage, const ImmutableOptions& _immutable_options,
+ const MutableCFOptions& _mutable_cf_options,
+ const MutableDBOptions& _mutable_db_options,
+ std::vector<CompactionInputFiles> _inputs, int _output_level,
+ uint64_t _target_file_size, uint64_t _max_compaction_bytes,
+ uint32_t _output_path_id, CompressionType _compression,
+ CompressionOptions _compression_opts, Temperature _output_temperature,
+ uint32_t _max_subcompactions, std::vector<FileMetaData*> _grandparents,
+ bool _manual_compaction, const std::string& _trim_ts, double _score,
+ bool _deletion_compaction, bool l0_files_might_overlap,
+ CompactionReason _compaction_reason,
+ BlobGarbageCollectionPolicy _blob_garbage_collection_policy,
+ double _blob_garbage_collection_age_cutoff)
+ : input_vstorage_(vstorage),
+ start_level_(_inputs[0].level),
+ output_level_(_output_level),
+ target_output_file_size_(_target_file_size),
+ max_compaction_bytes_(_max_compaction_bytes),
+ max_subcompactions_(_max_subcompactions),
+ immutable_options_(_immutable_options),
+ mutable_cf_options_(_mutable_cf_options),
+ input_version_(nullptr),
+ number_levels_(vstorage->num_levels()),
+ cfd_(nullptr),
+ output_path_id_(_output_path_id),
+ output_compression_(_compression),
+ output_compression_opts_(_compression_opts),
+ output_temperature_(_output_temperature),
+ deletion_compaction_(_deletion_compaction),
+ l0_files_might_overlap_(l0_files_might_overlap),
+ inputs_(PopulateWithAtomicBoundaries(vstorage, std::move(_inputs))),
+ grandparents_(std::move(_grandparents)),
+ score_(_score),
+ bottommost_level_(IsBottommostLevel(output_level_, vstorage, inputs_)),
+ is_full_compaction_(IsFullCompaction(vstorage, inputs_)),
+ is_manual_compaction_(_manual_compaction),
+ trim_ts_(_trim_ts),
+ is_trivial_move_(false),
+
+ compaction_reason_(_compaction_reason),
+ notify_on_compaction_completion_(false),
+ enable_blob_garbage_collection_(
+ _blob_garbage_collection_policy == BlobGarbageCollectionPolicy::kForce
+ ? true
+ : (_blob_garbage_collection_policy ==
+ BlobGarbageCollectionPolicy::kDisable
+ ? false
+ : mutable_cf_options()->enable_blob_garbage_collection)),
+ blob_garbage_collection_age_cutoff_(
+ _blob_garbage_collection_age_cutoff < 0 ||
+ _blob_garbage_collection_age_cutoff > 1
+ ? mutable_cf_options()->blob_garbage_collection_age_cutoff
+ : _blob_garbage_collection_age_cutoff),
+ penultimate_level_(EvaluatePenultimateLevel(
+ vstorage, immutable_options_, start_level_, output_level_)) {
+ MarkFilesBeingCompacted(true);
+ if (is_manual_compaction_) {
+ compaction_reason_ = CompactionReason::kManualCompaction;
+ }
+ if (max_subcompactions_ == 0) {
+ max_subcompactions_ = _mutable_db_options.max_subcompactions;
+ }
+
+ // For the non-bottommost levels, it tries to build files that match the
+ // target file size, but this is not guaranteed; a file could end up as large
+ // as 2x the target size.
+ max_output_file_size_ =
+ bottommost_level_ || grandparents_.empty() ||
+ !_immutable_options.level_compaction_dynamic_file_size
+ ? target_output_file_size_
+ : 2 * target_output_file_size_;
+
+#ifndef NDEBUG
+ for (size_t i = 1; i < inputs_.size(); ++i) {
+ assert(inputs_[i].level > inputs_[i - 1].level);
+ }
+#endif
+
+ // setup input_levels_
+ {
+ input_levels_.resize(num_input_levels());
+ for (size_t which = 0; which < num_input_levels(); which++) {
+ DoGenerateLevelFilesBrief(&input_levels_[which], inputs_[which].files,
+ &arena_);
+ }
+ }
+
+ GetBoundaryKeys(vstorage, inputs_, &smallest_user_key_, &largest_user_key_);
+
+ // Every compaction regardless of any compaction reason may respect the
+ // existing compact cursor in the output level to split output files
+ output_split_key_ = nullptr;
+ if (immutable_options_.compaction_style == kCompactionStyleLevel &&
+ immutable_options_.compaction_pri == kRoundRobin) {
+ const InternalKey* cursor =
+ &input_vstorage_->GetCompactCursors()[output_level_];
+ if (cursor->size() != 0) {
+ const Slice& cursor_user_key = ExtractUserKey(cursor->Encode());
+ auto ucmp = vstorage->InternalComparator()->user_comparator();
+ // May split output files according to the cursor if it is in the user-key
+ // range
+ if (ucmp->CompareWithoutTimestamp(cursor_user_key, smallest_user_key_) >
+ 0 &&
+ ucmp->CompareWithoutTimestamp(cursor_user_key, largest_user_key_) <=
+ 0) {
+ output_split_key_ = cursor;
+ }
+ }
+ }
+
+ PopulatePenultimateLevelOutputRange();
+}
+
+void Compaction::PopulatePenultimateLevelOutputRange() {
+ if (!SupportsPerKeyPlacement()) {
+ return;
+ }
+
+ // exclude the last level, the range of all input levels is the safe range
+ // of keys that can be moved up.
+ int exclude_level = number_levels_ - 1;
+ penultimate_output_range_type_ = PenultimateOutputRangeType::kNonLastRange;
+
+ // For universal compaction, the penultimate_output_range could be extended if
+ // all penultimate level files are included in the compaction (which includes
+ // the case that the penultimate level is empty).
+ if (immutable_options_.compaction_style == kCompactionStyleUniversal) {
+ exclude_level = kInvalidLevel;
+ std::set<uint64_t> penultimate_inputs;
+ for (const auto& input_lvl : inputs_) {
+ if (input_lvl.level == penultimate_level_) {
+ for (const auto& file : input_lvl.files) {
+ penultimate_inputs.emplace(file->fd.GetNumber());
+ }
+ }
+ }
+ auto penultimate_files = input_vstorage_->LevelFiles(penultimate_level_);
+ for (const auto& file : penultimate_files) {
+ if (penultimate_inputs.find(file->fd.GetNumber()) ==
+ penultimate_inputs.end()) {
+ exclude_level = number_levels_ - 1;
+ penultimate_output_range_type_ = PenultimateOutputRangeType::kFullRange;
+ break;
+ }
+ }
+ }
+
+ GetBoundaryKeys(input_vstorage_, inputs_,
+ &penultimate_level_smallest_user_key_,
+ &penultimate_level_largest_user_key_, exclude_level);
+
+ // If the penultimate level output range overlaps with existing files,
+ // disable the penultimate level output by setting the range to empty. One
+ // example is a range deletion whose boundary overlaps with the next file
+ // (which is actually a false overlap).
+ // TODO: Exclude such false overlaps, so they won't disable the penultimate
+ // output.
+ std::set<uint64_t> penultimate_inputs;
+ for (const auto& input_lvl : inputs_) {
+ if (input_lvl.level == penultimate_level_) {
+ for (const auto& file : input_lvl.files) {
+ penultimate_inputs.emplace(file->fd.GetNumber());
+ }
+ }
+ }
+
+ auto penultimate_files = input_vstorage_->LevelFiles(penultimate_level_);
+ for (const auto& file : penultimate_files) {
+ if (penultimate_inputs.find(file->fd.GetNumber()) ==
+ penultimate_inputs.end() &&
+ OverlapPenultimateLevelOutputRange(file->smallest.user_key(),
+ file->largest.user_key())) {
+ // Basically disable the penultimate range output; this should be rare, or
+ // a false overlap caused by a range deletion
+ penultimate_level_smallest_user_key_ = "";
+ penultimate_level_largest_user_key_ = "";
+ penultimate_output_range_type_ = PenultimateOutputRangeType::kDisabled;
+ }
+ }
+}
+
+Compaction::~Compaction() {
+ if (input_version_ != nullptr) {
+ input_version_->Unref();
+ }
+ if (cfd_ != nullptr) {
+ cfd_->UnrefAndTryDelete();
+ }
+}
+
+bool Compaction::SupportsPerKeyPlacement() const {
+ return penultimate_level_ != kInvalidLevel;
+}
+
+int Compaction::GetPenultimateLevel() const { return penultimate_level_; }
+
+// smallest_key and largest_key include timestamps if user-defined timestamp is
+// enabled.
+bool Compaction::OverlapPenultimateLevelOutputRange(
+ const Slice& smallest_key, const Slice& largest_key) const {
+ if (!SupportsPerKeyPlacement()) {
+ return false;
+ }
+ const Comparator* ucmp =
+ input_vstorage_->InternalComparator()->user_comparator();
+
+ return ucmp->CompareWithoutTimestamp(
+ smallest_key, penultimate_level_largest_user_key_) <= 0 &&
+ ucmp->CompareWithoutTimestamp(
+ largest_key, penultimate_level_smallest_user_key_) >= 0;
+}
+
+// key includes timestamp if user-defined timestamp is enabled.
+bool Compaction::WithinPenultimateLevelOutputRange(const Slice& key) const {
+ if (!SupportsPerKeyPlacement()) {
+ return false;
+ }
+
+ if (penultimate_level_smallest_user_key_.empty() ||
+ penultimate_level_largest_user_key_.empty()) {
+ return false;
+ }
+
+ const Comparator* ucmp =
+ input_vstorage_->InternalComparator()->user_comparator();
+
+ return ucmp->CompareWithoutTimestamp(
+ key, penultimate_level_smallest_user_key_) >= 0 &&
+ ucmp->CompareWithoutTimestamp(
+ key, penultimate_level_largest_user_key_) <= 0;
+}
+
+bool Compaction::InputCompressionMatchesOutput() const {
+ int base_level = input_vstorage_->base_level();
+ bool matches =
+ (GetCompressionType(input_vstorage_, mutable_cf_options_, start_level_,
+ base_level) == output_compression_);
+ if (matches) {
+ TEST_SYNC_POINT("Compaction::InputCompressionMatchesOutput:Matches");
+ return true;
+ }
+ TEST_SYNC_POINT("Compaction::InputCompressionMatchesOutput:DidntMatch");
+ return matches;
+}
+
+bool Compaction::IsTrivialMove() const {
+ // Avoid a move if there is lots of overlapping grandparent data.
+ // Otherwise, the move could create a parent file that will require
+ // a very expensive merge later on.
+ // If start_level_== output_level_, the purpose is to force compaction
+ // filter to be applied to that level, and thus cannot be a trivial move.
+
+ // Check if the start level has files with overlapping ranges
+ if (start_level_ == 0 && input_vstorage_->level0_non_overlapping() == false &&
+ l0_files_might_overlap_) {
+ // We cannot move files from L0 to L1 if the L0 files in the LSM-tree are
+ // overlapping, unless we are sure that files picked in L0 don't overlap.
+ return false;
+ }
+
+ if (is_manual_compaction_ &&
+ (immutable_options_.compaction_filter != nullptr ||
+ immutable_options_.compaction_filter_factory != nullptr)) {
+ // This is a manual compaction and we have a compaction filter that should
+ // be executed, so we cannot do a trivial move
+ return false;
+ }
+
+ if (start_level_ == output_level_) {
+ // It doesn't make sense for the compaction picker to pick files just to
+ // trivially move them to the same level.
+ return false;
+ }
+
+ // Used in universal compaction, where a trivial move can be done if the
+ // input files are non-overlapping
+ if ((mutable_cf_options_.compaction_options_universal.allow_trivial_move) &&
+ (output_level_ != 0) &&
+ (cfd_->ioptions()->compaction_style == kCompactionStyleUniversal)) {
+ return is_trivial_move_;
+ }
+
+ if (!(start_level_ != output_level_ && num_input_levels() == 1 &&
+ input(0, 0)->fd.GetPathId() == output_path_id() &&
+ InputCompressionMatchesOutput())) {
+ return false;
+ }
+
+ // assert inputs_.size() == 1
+
+ std::unique_ptr<SstPartitioner> partitioner = CreateSstPartitioner();
+
+ for (const auto& file : inputs_.front().files) {
+ std::vector<FileMetaData*> file_grand_parents;
+ if (output_level_ + 1 >= number_levels_) {
+ continue;
+ }
+ input_vstorage_->GetOverlappingInputs(output_level_ + 1, &file->smallest,
+ &file->largest, &file_grand_parents);
+ const auto compaction_size =
+ file->fd.GetFileSize() + TotalFileSize(file_grand_parents);
+ if (compaction_size > max_compaction_bytes_) {
+ return false;
+ }
+
+ if (partitioner.get() != nullptr) {
+ if (!partitioner->CanDoTrivialMove(file->smallest.user_key(),
+ file->largest.user_key())) {
+ return false;
+ }
+ }
+ }
+
+ // PerKeyPlacement compaction should never be trivial move.
+ if (SupportsPerKeyPlacement()) {
+ return false;
+ }
+
+ return true;
+}
+
+void Compaction::AddInputDeletions(VersionEdit* out_edit) {
+ for (size_t which = 0; which < num_input_levels(); which++) {
+ for (size_t i = 0; i < inputs_[which].size(); i++) {
+ out_edit->DeleteFile(level(which), inputs_[which][i]->fd.GetNumber());
+ }
+ }
+}
+
+bool Compaction::KeyNotExistsBeyondOutputLevel(
+ const Slice& user_key, std::vector<size_t>* level_ptrs) const {
+ assert(input_version_ != nullptr);
+ assert(level_ptrs != nullptr);
+ assert(level_ptrs->size() == static_cast<size_t>(number_levels_));
+ if (bottommost_level_) {
+ return true;
+ } else if (output_level_ != 0 &&
+ cfd_->ioptions()->compaction_style == kCompactionStyleLevel) {
+ // Maybe use binary search to find right entry instead of linear search?
+ const Comparator* user_cmp = cfd_->user_comparator();
+ for (int lvl = output_level_ + 1; lvl < number_levels_; lvl++) {
+ const std::vector<FileMetaData*>& files =
+ input_vstorage_->LevelFiles(lvl);
+ for (; level_ptrs->at(lvl) < files.size(); level_ptrs->at(lvl)++) {
+ auto* f = files[level_ptrs->at(lvl)];
+ if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) {
+ // We've advanced far enough
+ // In the presence of user-defined timestamp, we may need to handle
+ // the case in which f->smallest.user_key() (including ts) has the
+ // same user key, but the ts part is smaller. If so,
+ // Compare(user_key, f->smallest.user_key()) returns -1.
+ // That's why we need CompareWithoutTimestamp().
+ if (user_cmp->CompareWithoutTimestamp(user_key,
+ f->smallest.user_key()) >= 0) {
+ // Key falls in this file's range, so it may
+ // exist beyond output level
+ return false;
+ }
+ break;
+ }
+ }
+ }
+ return true;
+ }
+ return false;
+}
+
+// Mark (or clear) each file that is being compacted
+void Compaction::MarkFilesBeingCompacted(bool mark_as_compacted) {
+ for (size_t i = 0; i < num_input_levels(); i++) {
+ for (size_t j = 0; j < inputs_[i].size(); j++) {
+ assert(mark_as_compacted ? !inputs_[i][j]->being_compacted
+ : inputs_[i][j]->being_compacted);
+ inputs_[i][j]->being_compacted = mark_as_compacted;
+ }
+ }
+}
+
+// Sample output:
+// If compacting 3 L0 files, 2 L3 files and 1 L4 file, and outputting to L5,
+// print: "3@0 + 2@3 + 1@4 files to L5"
+const char* Compaction::InputLevelSummary(
+ InputLevelSummaryBuffer* scratch) const {
+ int len = 0;
+ bool is_first = true;
+ for (auto& input_level : inputs_) {
+ if (input_level.empty()) {
+ continue;
+ }
+ if (!is_first) {
+ len +=
+ snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, " + ");
+ len = std::min(len, static_cast<int>(sizeof(scratch->buffer)));
+ } else {
+ is_first = false;
+ }
+ len += snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len,
+ "%" ROCKSDB_PRIszt "@%d", input_level.size(),
+ input_level.level);
+ len = std::min(len, static_cast<int>(sizeof(scratch->buffer)));
+ }
+ snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len,
+ " files to L%d", output_level());
+
+ return scratch->buffer;
+}
+
+uint64_t Compaction::CalculateTotalInputSize() const {
+ uint64_t size = 0;
+ for (auto& input_level : inputs_) {
+ for (auto f : input_level.files) {
+ size += f->fd.GetFileSize();
+ }
+ }
+ return size;
+}
+
+void Compaction::ReleaseCompactionFiles(Status status) {
+ MarkFilesBeingCompacted(false);
+ cfd_->compaction_picker()->ReleaseCompactionFiles(this, status);
+}
+
+void Compaction::ResetNextCompactionIndex() {
+ assert(input_version_ != nullptr);
+ input_vstorage_->ResetNextCompactionIndex(start_level_);
+}
+
+namespace {
+int InputSummary(const std::vector<FileMetaData*>& files, char* output,
+ int len) {
+ *output = '\0';
+ int write = 0;
+ for (size_t i = 0; i < files.size(); i++) {
+ int sz = len - write;
+ int ret;
+ char sztxt[16];
+ AppendHumanBytes(files.at(i)->fd.GetFileSize(), sztxt, 16);
+ ret = snprintf(output + write, sz, "%" PRIu64 "(%s) ",
+ files.at(i)->fd.GetNumber(), sztxt);
+ if (ret < 0 || ret >= sz) break;
+ write += ret;
+ }
+ // if files.size() is non-zero, overwrite the last space
+ return write - !!files.size();
+}
+} // namespace
+
+void Compaction::Summary(char* output, int len) {
+ int write =
+ snprintf(output, len, "Base version %" PRIu64 " Base level %d, inputs: [",
+ input_version_->GetVersionNumber(), start_level_);
+ if (write < 0 || write >= len) {
+ return;
+ }
+
+ for (size_t level_iter = 0; level_iter < num_input_levels(); ++level_iter) {
+ if (level_iter > 0) {
+ write += snprintf(output + write, len - write, "], [");
+ if (write < 0 || write >= len) {
+ return;
+ }
+ }
+ write +=
+ InputSummary(inputs_[level_iter].files, output + write, len - write);
+ if (write < 0 || write >= len) {
+ return;
+ }
+ }
+
+ snprintf(output + write, len - write, "]");
+}
+
+uint64_t Compaction::OutputFilePreallocationSize() const {
+ uint64_t preallocation_size = 0;
+
+ for (const auto& level_files : inputs_) {
+ for (const auto& file : level_files.files) {
+ preallocation_size += file->fd.GetFileSize();
+ }
+ }
+
+ if (max_output_file_size_ != std::numeric_limits<uint64_t>::max() &&
+ (immutable_options_.compaction_style == kCompactionStyleLevel ||
+ output_level() > 0)) {
+ preallocation_size = std::min(max_output_file_size_, preallocation_size);
+ }
+
+ // Over-estimate slightly so we don't end up just barely crossing
+ // the threshold
+ // No point to preallocate more than 1GB.
+ return std::min(uint64_t{1073741824},
+ preallocation_size + (preallocation_size / 10));
+}
+
+std::unique_ptr<CompactionFilter> Compaction::CreateCompactionFilter() const {
+ if (!cfd_->ioptions()->compaction_filter_factory) {
+ return nullptr;
+ }
+
+ if (!cfd_->ioptions()
+ ->compaction_filter_factory->ShouldFilterTableFileCreation(
+ TableFileCreationReason::kCompaction)) {
+ return nullptr;
+ }
+
+ CompactionFilter::Context context;
+ context.is_full_compaction = is_full_compaction_;
+ context.is_manual_compaction = is_manual_compaction_;
+ context.column_family_id = cfd_->GetID();
+ context.reason = TableFileCreationReason::kCompaction;
+ return cfd_->ioptions()->compaction_filter_factory->CreateCompactionFilter(
+ context);
+}
+
+std::unique_ptr<SstPartitioner> Compaction::CreateSstPartitioner() const {
+ if (!immutable_options_.sst_partitioner_factory) {
+ return nullptr;
+ }
+
+ SstPartitioner::Context context;
+ context.is_full_compaction = is_full_compaction_;
+ context.is_manual_compaction = is_manual_compaction_;
+ context.output_level = output_level_;
+ context.smallest_user_key = smallest_user_key_;
+ context.largest_user_key = largest_user_key_;
+ return immutable_options_.sst_partitioner_factory->CreatePartitioner(context);
+}
+
+bool Compaction::IsOutputLevelEmpty() const {
+ return inputs_.back().level != output_level_ || inputs_.back().empty();
+}
+
+bool Compaction::ShouldFormSubcompactions() const {
+ if (cfd_ == nullptr) {
+ return false;
+ }
+
+ // Round-Robin pri under leveled compaction allows subcompactions by default
+ // and the number of subcompactions can be larger than max_subcompactions_
+ if (cfd_->ioptions()->compaction_pri == kRoundRobin &&
+ cfd_->ioptions()->compaction_style == kCompactionStyleLevel) {
+ return output_level_ > 0;
+ }
+
+ if (max_subcompactions_ <= 1) {
+ return false;
+ }
+
+ if (cfd_->ioptions()->compaction_style == kCompactionStyleLevel) {
+ return (start_level_ == 0 || is_manual_compaction_) && output_level_ > 0;
+ } else if (cfd_->ioptions()->compaction_style == kCompactionStyleUniversal) {
+ return number_levels_ > 1 && output_level_ > 0;
+ } else {
+ return false;
+ }
+}
+
+bool Compaction::DoesInputReferenceBlobFiles() const {
+ assert(input_version_);
+
+ const VersionStorageInfo* storage_info = input_version_->storage_info();
+ assert(storage_info);
+
+ if (storage_info->GetBlobFiles().empty()) {
+ return false;
+ }
+
+ for (size_t i = 0; i < inputs_.size(); ++i) {
+ for (const FileMetaData* meta : inputs_[i].files) {
+ assert(meta);
+
+ if (meta->oldest_blob_file_number != kInvalidBlobFileNumber) {
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+uint64_t Compaction::MinInputFileOldestAncesterTime(
+ const InternalKey* start, const InternalKey* end) const {
+ uint64_t min_oldest_ancester_time = std::numeric_limits<uint64_t>::max();
+ const InternalKeyComparator& icmp =
+ column_family_data()->internal_comparator();
+ for (const auto& level_files : inputs_) {
+ for (const auto& file : level_files.files) {
+ if (start != nullptr && icmp.Compare(file->largest, *start) < 0) {
+ continue;
+ }
+ if (end != nullptr && icmp.Compare(file->smallest, *end) > 0) {
+ continue;
+ }
+ uint64_t oldest_ancester_time = file->TryGetOldestAncesterTime();
+ if (oldest_ancester_time != 0) {
+ min_oldest_ancester_time =
+ std::min(min_oldest_ancester_time, oldest_ancester_time);
+ }
+ }
+ }
+ return min_oldest_ancester_time;
+}
+
+int Compaction::EvaluatePenultimateLevel(
+ const VersionStorageInfo* vstorage,
+ const ImmutableOptions& immutable_options, const int start_level,
+ const int output_level) {
+ // TODO: currently the per_key_placement feature only supports level and
+ // universal compaction
+ if (immutable_options.compaction_style != kCompactionStyleLevel &&
+ immutable_options.compaction_style != kCompactionStyleUniversal) {
+ return kInvalidLevel;
+ }
+ if (output_level != immutable_options.num_levels - 1) {
+ return kInvalidLevel;
+ }
+
+ int penultimate_level = output_level - 1;
+ assert(penultimate_level < immutable_options.num_levels);
+ if (penultimate_level <= 0) {
+ return kInvalidLevel;
+ }
+
+ // If the penultimate level is not within the input level -> output level
+ // range, check whether the penultimate output level is empty; if it is
+ // empty, it could also be locked for the penultimate output.
+ // TODO: ideally, it only needs to check if there's a file within the
+ // compaction output key range. For simplicity, it just checks if there's any
+ // file on the penultimate level.
+ if (start_level == immutable_options.num_levels - 1 &&
+ (immutable_options.compaction_style != kCompactionStyleUniversal ||
+ !vstorage->LevelFiles(penultimate_level).empty())) {
+ return kInvalidLevel;
+ }
+
+ bool supports_per_key_placement =
+ immutable_options.preclude_last_level_data_seconds > 0;
+
+ // It could be overridden by a unit test
+ TEST_SYNC_POINT_CALLBACK("Compaction::SupportsPerKeyPlacement:Enabled",
+ &supports_per_key_placement);
+ if (!supports_per_key_placement) {
+ return kInvalidLevel;
+ }
+
+ return penultimate_level;
+}
+
+} // namespace ROCKSDB_NAMESPACE
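
As a quick illustration of the sstableKeyCompare() ordering defined above (an editorial sketch, not part of the diff): with equal user keys, the range tombstone sentinel sorts before any real key, which is what keeps adjacent SSTs that only share a sentinel boundary from being treated as overlapping. InternalKey, kMaxSequenceNumber, and the value type constants are assumed to come from db/dbformat.h; the function SentinelOrderingExample is hypothetical.

    #include <cassert>

    #include "db/compaction/compaction.h"
    #include "db/dbformat.h"
    #include "rocksdb/comparator.h"

    using namespace ROCKSDB_NAMESPACE;

    void SentinelOrderingExample() {
      const Comparator* ucmp = BytewiseComparator();

      // A regular key and the range tombstone sentinel for the same user key.
      InternalKey regular("foo", 100, kTypeValue);
      InternalKey sentinel("foo", kMaxSequenceNumber, kTypeRangeDeletion);

      assert(sstableKeyCompare(ucmp, regular, regular) == 0);  // equal user keys
      assert(sstableKeyCompare(ucmp, sentinel, regular) < 0);  // sentinel sorts first
      assert(sstableKeyCompare(ucmp, regular, sentinel) > 0);

      // A null 'a' pointer compares lower than any key (see the pointer
      // overload above).
      assert(sstableKeyCompare(ucmp, nullptr, regular) < 0);
    }
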
diff --git a/src/rocksdb/db/compaction/compaction.h b/src/rocksdb/db/compaction/compaction.h
new file mode 100644
index 000000000..21d1190ac
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction.h
@@ -0,0 +1,559 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include "db/version_set.h"
+#include "memory/arena.h"
+#include "options/cf_options.h"
+#include "rocksdb/sst_partitioner.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+// The file contains class Compaction, as well as some helper functions
+// and data structures used by the class.
+
+// Utility for comparing sstable boundary keys. Returns -1 if either a or b is
+// null which provides the property that a==null indicates a key that is less
+// than any key and b==null indicates a key that is greater than any key. Note
+// that the comparison is performed primarily on the user-key portion of the
+// key. If the user-keys compare equal, an additional test is made to sort
+// range tombstone sentinel keys before other keys with the same user-key. The
+// result is that 2 user-keys will compare equal if they differ purely on
+// their sequence number and value, but the range tombstone sentinel for that
+// user-key will compare not equal. This is necessary because the range
+// tombstone sentinel key is set as the largest key for an sstable even though
+// that key never appears in the database. We don't want adjacent sstables to
+// be considered overlapping if they are separated by the range tombstone
+// sentinel.
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
+ const InternalKey& b);
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey* a,
+ const InternalKey& b);
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
+ const InternalKey* b);
+
+// An AtomicCompactionUnitBoundary represents a range of keys [smallest,
+ // largest] that exactly spans one or more neighbouring SSTs on the same
+// level. Every pair of SSTs in this range "overlap" (i.e., the largest
+// user key of one file is the smallest user key of the next file). These
+// boundaries are propagated down to RangeDelAggregator during compaction
+// to provide safe truncation boundaries for range tombstones.
+struct AtomicCompactionUnitBoundary {
+ const InternalKey* smallest = nullptr;
+ const InternalKey* largest = nullptr;
+};
+
+// The structure that manages compaction input files associated
+// with the same physical level.
+struct CompactionInputFiles {
+ int level;
+ std::vector<FileMetaData*> files;
+ std::vector<AtomicCompactionUnitBoundary> atomic_compaction_unit_boundaries;
+ inline bool empty() const { return files.empty(); }
+ inline size_t size() const { return files.size(); }
+ inline void clear() { files.clear(); }
+ inline FileMetaData* operator[](size_t i) const { return files[i]; }
+};
+
+class Version;
+class ColumnFamilyData;
+class VersionStorageInfo;
+class CompactionFilter;
+
+// A Compaction encapsulates metadata about a compaction.
+class Compaction {
+ public:
+ Compaction(VersionStorageInfo* input_version,
+ const ImmutableOptions& immutable_options,
+ const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options,
+ std::vector<CompactionInputFiles> inputs, int output_level,
+ uint64_t target_file_size, uint64_t max_compaction_bytes,
+ uint32_t output_path_id, CompressionType compression,
+ CompressionOptions compression_opts,
+ Temperature output_temperature, uint32_t max_subcompactions,
+ std::vector<FileMetaData*> grandparents,
+ bool manual_compaction = false, const std::string& trim_ts = "",
+ double score = -1, bool deletion_compaction = false,
+ bool l0_files_might_overlap = true,
+ CompactionReason compaction_reason = CompactionReason::kUnknown,
+ BlobGarbageCollectionPolicy blob_garbage_collection_policy =
+ BlobGarbageCollectionPolicy::kUseDefault,
+ double blob_garbage_collection_age_cutoff = -1);
+
+ // The type of the penultimate level output range
+ enum class PenultimateOutputRangeType : int {
+ kNotSupported, // it cannot output to the penultimate level
+ kFullRange, // any data could be output to the penultimate level
+ kNonLastRange, // only the keys within non_last_level compaction inputs can
+ // be outputted to the penultimate level
+ kDisabled, // no data can be outputted to the penultimate level
+ };
+
+ // No copying allowed
+ Compaction(const Compaction&) = delete;
+ void operator=(const Compaction&) = delete;
+
+ ~Compaction();
+
+ // Returns the level associated with the specified compaction input level.
+ // If compaction_input_level is not specified, then input_level is set to 0.
+ int level(size_t compaction_input_level = 0) const {
+ return inputs_[compaction_input_level].level;
+ }
+
+ int start_level() const { return start_level_; }
+
+ // Outputs will go to this level
+ int output_level() const { return output_level_; }
+
+ // Returns the number of input levels in this compaction.
+ size_t num_input_levels() const { return inputs_.size(); }
+
+ // Return the object that holds the edits to the descriptor done
+ // by this compaction.
+ VersionEdit* edit() { return &edit_; }
+
+ // Returns the number of input files associated with the specified
+ // compaction input level.
+ // The function will return 0 when "compaction_input_level" < 0
+ // or "compaction_input_level" >= "num_input_levels()".
+ size_t num_input_files(size_t compaction_input_level) const {
+ if (compaction_input_level < inputs_.size()) {
+ return inputs_[compaction_input_level].size();
+ }
+ return 0;
+ }
+
+ // Returns input version of the compaction
+ Version* input_version() const { return input_version_; }
+
+ // Returns the ColumnFamilyData associated with the compaction.
+ ColumnFamilyData* column_family_data() const { return cfd_; }
+
+ // Returns the file meta data of the 'i'th input file at the
+ // specified compaction input level.
+ // REQUIREMENT: "compaction_input_level" must be >= 0 and
+ // < "input_levels()"
+ FileMetaData* input(size_t compaction_input_level, size_t i) const {
+ assert(compaction_input_level < inputs_.size());
+ return inputs_[compaction_input_level][i];
+ }
+
+ const std::vector<AtomicCompactionUnitBoundary>* boundaries(
+ size_t compaction_input_level) const {
+ assert(compaction_input_level < inputs_.size());
+ return &inputs_[compaction_input_level].atomic_compaction_unit_boundaries;
+ }
+
+ // Returns the list of file meta data of the specified compaction
+ // input level.
+ // REQUIREMENT: "compaction_input_level" must be >= 0 and
+ // < "input_levels()"
+ const std::vector<FileMetaData*>* inputs(
+ size_t compaction_input_level) const {
+ assert(compaction_input_level < inputs_.size());
+ return &inputs_[compaction_input_level].files;
+ }
+
+ const std::vector<CompactionInputFiles>* inputs() { return &inputs_; }
+
+ // Returns the LevelFilesBrief of the specified compaction input level.
+ const LevelFilesBrief* input_levels(size_t compaction_input_level) const {
+ return &input_levels_[compaction_input_level];
+ }
+
+ // Maximum size of files to build during this compaction.
+ uint64_t max_output_file_size() const { return max_output_file_size_; }
+
+ // Target output file size for this compaction
+ uint64_t target_output_file_size() const { return target_output_file_size_; }
+
+ // What compression for output
+ CompressionType output_compression() const { return output_compression_; }
+
+ // What compression options for output
+ const CompressionOptions& output_compression_opts() const {
+ return output_compression_opts_;
+ }
+
+ // Whether the output file needs to be written to a second DB path.
+ uint32_t output_path_id() const { return output_path_id_; }
+
+ // Is this a trivial compaction that can be implemented by just
+ // moving a single input file to the next level (no merging or splitting)
+ bool IsTrivialMove() const;
+
+ // The split user key in the output level if this compaction is required to
+ // split the output files according to the existing cursor in the output
+ // level under round-robin compaction policy. Empty indicates no required
+ // splitting key
+ const InternalKey* GetOutputSplitKey() const { return output_split_key_; }
+
+ // If true, then the compaction can be done by simply deleting input files.
+ bool deletion_compaction() const { return deletion_compaction_; }
+
+ // Add all inputs to this compaction as delete operations to *edit.
+ void AddInputDeletions(VersionEdit* edit);
+
+ // Returns true if the available information we have guarantees that
+ // the input "user_key" does not exist in any level beyond "output_level()".
+ bool KeyNotExistsBeyondOutputLevel(const Slice& user_key,
+ std::vector<size_t>* level_ptrs) const;
+
+ // Clear all files to indicate that they are not being compacted
+ // Delete this compaction from the list of running compactions.
+ //
+ // Requirement: DB mutex held
+ void ReleaseCompactionFiles(Status status);
+
+ // Returns the summary of the compaction in "output" with maximum "len"
+ // in bytes. The caller is responsible for the memory management of
+ // "output".
+ void Summary(char* output, int len);
+
+ // Return the score that was used to pick this compaction run.
+ double score() const { return score_; }
+
+ // Is this compaction creating a file in the bottom most level?
+ bool bottommost_level() const { return bottommost_level_; }
+
+ // Is the compaction compacting to the last level?
+ bool is_last_level() const {
+ return output_level_ == immutable_options_.num_levels - 1;
+ }
+
+ // Does this compaction include all sst files?
+ bool is_full_compaction() const { return is_full_compaction_; }
+
+ // Was this compaction triggered manually by the client?
+ bool is_manual_compaction() const { return is_manual_compaction_; }
+
+ std::string trim_ts() const { return trim_ts_; }
+
+ // Used when the allow_trivial_move option is set in
+ // universal compaction. If all the input files are
+ // non-overlapping, then the is_trivial_move_ variable
+ // will be set to true, else false
+ void set_is_trivial_move(bool trivial_move) {
+ is_trivial_move_ = trivial_move;
+ }
+
+ // Used when allow_trivial_move option is set in
+ // Universal compaction. Returns true, if the input files
+ // are non-overlapping and can be trivially moved.
+ bool is_trivial_move() const { return is_trivial_move_; }
+
+ // How many total levels are there?
+ int number_levels() const { return number_levels_; }
+
+ // Return the ImmutableOptions that should be used throughout the compaction
+ // procedure
+ const ImmutableOptions* immutable_options() const {
+ return &immutable_options_;
+ }
+
+ // Return the MutableCFOptions that should be used throughout the compaction
+ // procedure
+ const MutableCFOptions* mutable_cf_options() const {
+ return &mutable_cf_options_;
+ }
+
+ // Returns the size in bytes that the output file should be preallocated to.
+ // In level compaction, that is max_output_file_size_. In universal
+ // compaction, that is the sum of all input file sizes.
+ uint64_t OutputFilePreallocationSize() const;
+
+ void SetInputVersion(Version* input_version);
+
+ struct InputLevelSummaryBuffer {
+ char buffer[128];
+ };
+
+ const char* InputLevelSummary(InputLevelSummaryBuffer* scratch) const;
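+ // Illustrative (hypothetical) output: for a compaction of 2 files at level
+ // 0 and 3 files at level 1, the summary reads roughly "2@0 + 3@1 files".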
+
+ uint64_t CalculateTotalInputSize() const;
+
+ // In case of compaction error, reset the nextIndex that is used
+ // to pick up the next file to be compacted from files_by_size_
+ void ResetNextCompactionIndex();
+
+ // Create a CompactionFilter from compaction_filter_factory
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter() const;
+
+ // Create a SstPartitioner from sst_partitioner_factory
+ std::unique_ptr<SstPartitioner> CreateSstPartitioner() const;
+
+ // Is the input level corresponding to output_level_ empty?
+ bool IsOutputLevelEmpty() const;
+
+ // Should this compaction be broken up into smaller ones run in parallel?
+ bool ShouldFormSubcompactions() const;
+
+ // Returns true iff at least one input file references a blob file.
+ //
+ // PRE: input version has been set.
+ bool DoesInputReferenceBlobFiles() const;
+
+ // Test-only helper to validate IsBottommostLevel() -- determines whether a
+ // compaction with the given inputs and storage is bottommost
+ static bool TEST_IsBottommostLevel(
+ int output_level, VersionStorageInfo* vstorage,
+ const std::vector<CompactionInputFiles>& inputs);
+
+ TablePropertiesCollection GetOutputTableProperties() const {
+ return output_table_properties_;
+ }
+
+ void SetOutputTableProperties(TablePropertiesCollection tp) {
+ output_table_properties_ = std::move(tp);
+ }
+
+ Slice GetSmallestUserKey() const { return smallest_user_key_; }
+
+ Slice GetLargestUserKey() const { return largest_user_key_; }
+
+ Slice GetPenultimateLevelSmallestUserKey() const {
+ return penultimate_level_smallest_user_key_;
+ }
+
+ Slice GetPenultimateLevelLargestUserKey() const {
+ return penultimate_level_largest_user_key_;
+ }
+
+ PenultimateOutputRangeType GetPenultimateOutputRangeType() const {
+ return penultimate_output_range_type_;
+ }
+
+ // Return true if the compaction supports per_key_placement
+ bool SupportsPerKeyPlacement() const;
+
+ // Get per_key_placement penultimate output level, which is `last_level - 1`
+ // if per_key_placement feature is supported. Otherwise, return -1.
+ int GetPenultimateLevel() const;
+
+ // Return true if the given range overlaps with the penultimate level output
+ // range.
+ // Both smallest_key and largest_key include timestamps if user-defined
+ // timestamp is enabled.
+ bool OverlapPenultimateLevelOutputRange(const Slice& smallest_key,
+ const Slice& largest_key) const;
+
+ // Return true if the key is within the penultimate level output range for
+ // the per_key_placement feature, i.e. it is safe to place the key in the
+ // penultimate level. Different compaction strategies have different rules.
+ // If per_key_placement is not supported, always return false.
+ // TODO: currently it doesn't support moving data from the last level to the
+ // penultimate level.
+ // The key includes the timestamp if user-defined timestamp is enabled.
+ bool WithinPenultimateLevelOutputRange(const Slice& key) const;
+
+ CompactionReason compaction_reason() const { return compaction_reason_; }
+
+ const std::vector<FileMetaData*>& grandparents() const {
+ return grandparents_;
+ }
+
+ uint64_t max_compaction_bytes() const { return max_compaction_bytes_; }
+
+ Temperature output_temperature() const { return output_temperature_; }
+
+ uint32_t max_subcompactions() const { return max_subcompactions_; }
+
+ bool enable_blob_garbage_collection() const {
+ return enable_blob_garbage_collection_;
+ }
+
+ double blob_garbage_collection_age_cutoff() const {
+ return blob_garbage_collection_age_cutoff_;
+ }
+
+ // start and end are the sub-compaction range. Null if no boundary.
+ // This is used to filter out some input files' ancestor time range.
+ uint64_t MinInputFileOldestAncesterTime(const InternalKey* start,
+ const InternalKey* end) const;
+
+ // Called by DBImpl::NotifyOnCompactionCompleted to make sure number of
+ // compaction begin and compaction completion callbacks match.
+ void SetNotifyOnCompactionCompleted() {
+ notify_on_compaction_completion_ = true;
+ }
+
+ bool ShouldNotifyOnCompactionCompleted() const {
+ return notify_on_compaction_completion_;
+ }
+
+ static constexpr int kInvalidLevel = -1;
+
+ // Evaluate the penultimate output level. If the compaction supports the
+ // per_key_placement feature, it returns the penultimate level number.
+ // Otherwise, it returns kInvalidLevel (-1), which means
+ // output_to_penultimate_level is not supported.
+ // Note: even if penultimate level output is supported (PenultimateLevel !=
+ // kInvalidLevel), some key ranges may be unsafe to output to the
+ // penultimate level. The safe key range is populated by
+ // `PopulatePenultimateLevelOutputRange()`, which could potentially disable
+ // all penultimate level output.
+ static int EvaluatePenultimateLevel(const VersionStorageInfo* vstorage,
+ const ImmutableOptions& immutable_options,
+ const int start_level,
+ const int output_level);
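+ // Illustrative example (not from the original source): with
+ // immutable_options.num_levels == 7 (so the last level is 6) and
+ // output_level == 6, a compaction that supports per_key_placement would
+ // evaluate the penultimate level as 5; otherwise this returns
+ // kInvalidLevel.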
+
+ private:
+ // mark (or clear) all files that are being compacted
+ void MarkFilesBeingCompacted(bool mark_as_compacted);
+
+ // get the smallest and largest key present in files to be compacted
+ static void GetBoundaryKeys(VersionStorageInfo* vstorage,
+ const std::vector<CompactionInputFiles>& inputs,
+ Slice* smallest_key, Slice* largest_key,
+ int exclude_level = -1);
+
+ // Populate the penultimate level output range, which will be used to
+ // determine whether a key is safe to output to the penultimate level (see
+ // `Compaction::WithinPenultimateLevelOutputRange()` for details).
+ void PopulatePenultimateLevelOutputRange();
+
+ // Get the atomic file boundaries for all files in the compaction. Necessary
+ // in order to avoid the scenario described in
+ // https://github.com/facebook/rocksdb/pull/4432#discussion_r221072219 and
+ // plumb down appropriate key boundaries to RangeDelAggregator during
+ // compaction.
+ static std::vector<CompactionInputFiles> PopulateWithAtomicBoundaries(
+ VersionStorageInfo* vstorage, std::vector<CompactionInputFiles> inputs);
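+ // Roughly (illustrative description, see the linked discussion for the
+ // precise rationale): if one input file's largest key and the next file's
+ // smallest key share the same user key (differing only by sequence number),
+ // the two files form one atomic unit, and the expanded boundaries keep
+ // range tombstones that span them consistent during compaction.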
+
+ // helper function to determine if compaction with inputs and storage is
+ // bottommost
+ static bool IsBottommostLevel(
+ int output_level, VersionStorageInfo* vstorage,
+ const std::vector<CompactionInputFiles>& inputs);
+
+ static bool IsFullCompaction(VersionStorageInfo* vstorage,
+ const std::vector<CompactionInputFiles>& inputs);
+
+ VersionStorageInfo* input_vstorage_;
+
+ const int start_level_; // the lowest level to be compacted
+ const int output_level_; // level to which output files are stored
+ uint64_t target_output_file_size_;
+ uint64_t max_output_file_size_;
+ uint64_t max_compaction_bytes_;
+ uint32_t max_subcompactions_;
+ const ImmutableOptions immutable_options_;
+ const MutableCFOptions mutable_cf_options_;
+ Version* input_version_;
+ VersionEdit edit_;
+ const int number_levels_;
+ ColumnFamilyData* cfd_;
+ Arena arena_; // Arena used to allocate space for file_levels_
+
+ const uint32_t output_path_id_;
+ CompressionType output_compression_;
+ CompressionOptions output_compression_opts_;
+ Temperature output_temperature_;
+ // If true, then the compaction can be done by simply deleting input files.
+ const bool deletion_compaction_;
+ // should it split the output file using the compact cursor?
+ const InternalKey* output_split_key_;
+
+ // L0 files in the LSM-tree might be overlapping, but the compaction picking
+ // logic might pick a subset of the files that aren't overlapping. If
+ // that is the case, set the value to false. Otherwise, set it to true.
+ bool l0_files_might_overlap_;
+
+ // Compaction input files organized by level. Constant after construction
+ const std::vector<CompactionInputFiles> inputs_;
+
+ // A copy of inputs_, organized more closely in memory
+ autovector<LevelFilesBrief, 2> input_levels_;
+
+ // State used to check for number of overlapping grandparent files
+ // (grandparent == "output_level_ + 1")
+ std::vector<FileMetaData*> grandparents_;
+ const double score_; // score that was used to pick this compaction.
+
+ // Is this compaction creating a file in the bottom most level?
+ const bool bottommost_level_;
+ // Does this compaction include all sst files?
+ const bool is_full_compaction_;
+
+ // Is this compaction requested by the client?
+ const bool is_manual_compaction_;
+
+ // The data with timestamp > trim_ts_ will be removed
+ const std::string trim_ts_;
+
+ // True if we can do trivial move in Universal multi level
+ // compaction
+ bool is_trivial_move_;
+
+ // Does input compression match the output compression?
+ bool InputCompressionMatchesOutput() const;
+
+ // table properties of output files
+ TablePropertiesCollection output_table_properties_;
+
+ // smallest user key in compaction
+ // includes timestamp if user-defined timestamp is enabled.
+ Slice smallest_user_key_;
+
+ // largest user key in compaction
+ // includes timestamp if user-defined timestamp is enabled.
+ Slice largest_user_key_;
+
+ // Reason for compaction
+ CompactionReason compaction_reason_;
+
+ // Notify on compaction completion only if listener was notified on compaction
+ // begin.
+ bool notify_on_compaction_completion_;
+
+ // Enable/disable GC collection for blobs during compaction.
+ bool enable_blob_garbage_collection_;
+
+ // Blob garbage collection age cutoff.
+ double blob_garbage_collection_age_cutoff_;
+
+ // only set when per_key_placement feature is enabled, -1 (kInvalidLevel)
+ // means not supported.
+ const int penultimate_level_;
+
+ // Key range for penultimate level output
+ // includes timestamp if user-defined timestamp is enabled.
+ // penultimate_output_range_type_ shows the range type
+ Slice penultimate_level_smallest_user_key_;
+ Slice penultimate_level_largest_user_key_;
+ PenultimateOutputRangeType penultimate_output_range_type_ =
+ PenultimateOutputRangeType::kNotSupported;
+};
+
+#ifndef NDEBUG
+// Helper struct only for tests, which contains the data to decide if a key
+// should be output to the penultimate level.
+// TODO: remove this when the public feature knob is available
+struct PerKeyPlacementContext {
+ const int level;
+ const Slice key;
+ const Slice value;
+ const SequenceNumber seq_num;
+
+ bool output_to_penultimate_level;
+
+ PerKeyPlacementContext(int _level, Slice _key, Slice _value,
+ SequenceNumber _seq_num)
+ : level(_level), key(_key), value(_value), seq_num(_seq_num) {
+ output_to_penultimate_level = false;
+ }
+};
+#endif /* !NDEBUG */
+
+// Return sum of sizes of all files in `files`.
+extern uint64_t TotalFileSize(const std::vector<FileMetaData*>& files);
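+// A plausible implementation sketch (illustrative only; the real definition
+// lives in the compaction .cc sources):
+//
+//   uint64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
+//     uint64_t sum = 0;
+//     for (const FileMetaData* f : files) {
+//       sum += f->fd.GetFileSize();
+//     }
+//     return sum;
+//   }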
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_iteration_stats.h b/src/rocksdb/db/compaction/compaction_iteration_stats.h
new file mode 100644
index 000000000..1b1c28b57
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_iteration_stats.h
@@ -0,0 +1,49 @@
+// Copyright (c) 2016-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cstdint>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct CompactionIterationStats {
+ // Compaction statistics
+
+ // Doesn't include records skipped because of
+ // CompactionFilter::Decision::kRemoveAndSkipUntil.
+ int64_t num_record_drop_user = 0;
+
+ int64_t num_record_drop_hidden = 0;
+ int64_t num_record_drop_obsolete = 0;
+ int64_t num_record_drop_range_del = 0;
+ int64_t num_range_del_drop_obsolete = 0;
+ // Deletions obsoleted before bottom level due to file gap optimization.
+ int64_t num_optimized_del_drop_obsolete = 0;
+ uint64_t total_filter_time = 0;
+
+ // Input statistics
+ // TODO(noetzli): The stats are incomplete. They are lacking everything
+ // consumed by MergeHelper.
+ uint64_t num_input_records = 0;
+ uint64_t num_input_deletion_records = 0;
+ uint64_t num_input_corrupt_records = 0;
+ uint64_t total_input_raw_key_bytes = 0;
+ uint64_t total_input_raw_value_bytes = 0;
+
+ // Single-Delete diagnostics for exceptional situations
+ uint64_t num_single_del_fallthru = 0;
+ uint64_t num_single_del_mismatch = 0;
+
+ // Blob related statistics
+ uint64_t num_blobs_read = 0;
+ uint64_t total_blob_bytes_read = 0;
+ uint64_t num_blobs_relocated = 0;
+ uint64_t total_blob_bytes_relocated = 0;
+};
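+// Illustrative (hypothetical) usage: a compaction job may keep one instance
+// per sub-compaction and fold the counters together at the end, e.g.
+//
+//   total.num_input_records += part.num_input_records;
+//   total.total_filter_time += part.total_filter_time;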
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_iterator.cc b/src/rocksdb/db/compaction/compaction_iterator.cc
new file mode 100644
index 000000000..9f54f7813
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_iterator.cc
@@ -0,0 +1,1338 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/compaction/compaction_iterator.h"
+
+#include <iterator>
+#include <limits>
+
+#include "db/blob/blob_fetcher.h"
+#include "db/blob/blob_file_builder.h"
+#include "db/blob/blob_index.h"
+#include "db/blob/prefetch_buffer_collection.h"
+#include "db/snapshot_checker.h"
+#include "logging/logging.h"
+#include "port/likely.h"
+#include "rocksdb/listener.h"
+#include "table/internal_iterator.h"
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+CompactionIterator::CompactionIterator(
+ InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
+ SequenceNumber last_sequence, std::vector<SequenceNumber>* snapshots,
+ SequenceNumber earliest_write_conflict_snapshot,
+ SequenceNumber job_snapshot, const SnapshotChecker* snapshot_checker,
+ Env* env, bool report_detailed_time, bool expect_valid_internal_key,
+ CompactionRangeDelAggregator* range_del_agg,
+ BlobFileBuilder* blob_file_builder, bool allow_data_in_errors,
+ bool enforce_single_del_contracts,
+ const std::atomic<bool>& manual_compaction_canceled,
+ const Compaction* compaction, const CompactionFilter* compaction_filter,
+ const std::atomic<bool>* shutting_down,
+ const std::shared_ptr<Logger> info_log,
+ const std::string* full_history_ts_low,
+ const SequenceNumber preserve_time_min_seqno,
+ const SequenceNumber preclude_last_level_min_seqno)
+ : CompactionIterator(
+ input, cmp, merge_helper, last_sequence, snapshots,
+ earliest_write_conflict_snapshot, job_snapshot, snapshot_checker, env,
+ report_detailed_time, expect_valid_internal_key, range_del_agg,
+ blob_file_builder, allow_data_in_errors, enforce_single_del_contracts,
+ manual_compaction_canceled,
+ std::unique_ptr<CompactionProxy>(
+ compaction ? new RealCompaction(compaction) : nullptr),
+ compaction_filter, shutting_down, info_log, full_history_ts_low,
+ preserve_time_min_seqno, preclude_last_level_min_seqno) {}
+
+CompactionIterator::CompactionIterator(
+ InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
+ SequenceNumber /*last_sequence*/, std::vector<SequenceNumber>* snapshots,
+ SequenceNumber earliest_write_conflict_snapshot,
+ SequenceNumber job_snapshot, const SnapshotChecker* snapshot_checker,
+ Env* env, bool report_detailed_time, bool expect_valid_internal_key,
+ CompactionRangeDelAggregator* range_del_agg,
+ BlobFileBuilder* blob_file_builder, bool allow_data_in_errors,
+ bool enforce_single_del_contracts,
+ const std::atomic<bool>& manual_compaction_canceled,
+ std::unique_ptr<CompactionProxy> compaction,
+ const CompactionFilter* compaction_filter,
+ const std::atomic<bool>* shutting_down,
+ const std::shared_ptr<Logger> info_log,
+ const std::string* full_history_ts_low,
+ const SequenceNumber preserve_time_min_seqno,
+ const SequenceNumber preclude_last_level_min_seqno)
+ : input_(input, cmp,
+ !compaction || compaction->DoesInputReferenceBlobFiles()),
+ cmp_(cmp),
+ merge_helper_(merge_helper),
+ snapshots_(snapshots),
+ earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot),
+ job_snapshot_(job_snapshot),
+ snapshot_checker_(snapshot_checker),
+ env_(env),
+ clock_(env_->GetSystemClock().get()),
+ report_detailed_time_(report_detailed_time),
+ expect_valid_internal_key_(expect_valid_internal_key),
+ range_del_agg_(range_del_agg),
+ blob_file_builder_(blob_file_builder),
+ compaction_(std::move(compaction)),
+ compaction_filter_(compaction_filter),
+ shutting_down_(shutting_down),
+ manual_compaction_canceled_(manual_compaction_canceled),
+ bottommost_level_(!compaction_ ? false
+ : compaction_->bottommost_level() &&
+ !compaction_->allow_ingest_behind()),
+ // snapshots_ cannot be nullptr, but we will assert later in the body of
+ // the constructor.
+ visible_at_tip_(snapshots_ ? snapshots_->empty() : false),
+ earliest_snapshot_(!snapshots_ || snapshots_->empty()
+ ? kMaxSequenceNumber
+ : snapshots_->at(0)),
+ info_log_(info_log),
+ allow_data_in_errors_(allow_data_in_errors),
+ enforce_single_del_contracts_(enforce_single_del_contracts),
+ timestamp_size_(cmp_ ? cmp_->timestamp_size() : 0),
+ full_history_ts_low_(full_history_ts_low),
+ current_user_key_sequence_(0),
+ current_user_key_snapshot_(0),
+ merge_out_iter_(merge_helper_),
+ blob_garbage_collection_cutoff_file_number_(
+ ComputeBlobGarbageCollectionCutoffFileNumber(compaction_.get())),
+ blob_fetcher_(CreateBlobFetcherIfNeeded(compaction_.get())),
+ prefetch_buffers_(
+ CreatePrefetchBufferCollectionIfNeeded(compaction_.get())),
+ current_key_committed_(false),
+ cmp_with_history_ts_low_(0),
+ level_(compaction_ == nullptr ? 0 : compaction_->level()),
+ preserve_time_min_seqno_(preserve_time_min_seqno),
+ preclude_last_level_min_seqno_(preclude_last_level_min_seqno) {
+ assert(snapshots_ != nullptr);
+ assert(preserve_time_min_seqno_ <= preclude_last_level_min_seqno_);
+
+ if (compaction_ != nullptr) {
+ level_ptrs_ = std::vector<size_t>(compaction_->number_levels(), 0);
+ }
+#ifndef NDEBUG
+ // findEarliestVisibleSnapshot assumes this ordering.
+ for (size_t i = 1; i < snapshots_->size(); ++i) {
+ assert(snapshots_->at(i - 1) < snapshots_->at(i));
+ }
+ assert(timestamp_size_ == 0 || !full_history_ts_low_ ||
+ timestamp_size_ == full_history_ts_low_->size());
+#endif
+ input_.SetPinnedItersMgr(&pinned_iters_mgr_);
+ TEST_SYNC_POINT_CALLBACK("CompactionIterator:AfterInit", compaction_.get());
+}
+
+CompactionIterator::~CompactionIterator() {
+ // input_ Iterator lifetime is longer than pinned_iters_mgr_ lifetime
+ input_.SetPinnedItersMgr(nullptr);
+}
+
+void CompactionIterator::ResetRecordCounts() {
+ iter_stats_.num_record_drop_user = 0;
+ iter_stats_.num_record_drop_hidden = 0;
+ iter_stats_.num_record_drop_obsolete = 0;
+ iter_stats_.num_record_drop_range_del = 0;
+ iter_stats_.num_range_del_drop_obsolete = 0;
+ iter_stats_.num_optimized_del_drop_obsolete = 0;
+}
+
+void CompactionIterator::SeekToFirst() {
+ NextFromInput();
+ PrepareOutput();
+}
+
+void CompactionIterator::Next() {
+ // If there is a merge output, return it before continuing to process the
+ // input.
+ if (merge_out_iter_.Valid()) {
+ merge_out_iter_.Next();
+
+ // Check if we returned all records of the merge output.
+ if (merge_out_iter_.Valid()) {
+ key_ = merge_out_iter_.key();
+ value_ = merge_out_iter_.value();
+ Status s = ParseInternalKey(key_, &ikey_, allow_data_in_errors_);
+ // MergeUntil stops when it encounters a corrupt key and does not
+ // include it in the result, so we expect the keys here to be valid.
+ if (!s.ok()) {
+ ROCKS_LOG_FATAL(
+ info_log_, "Invalid ikey %s in compaction. %s",
+ allow_data_in_errors_ ? key_.ToString(true).c_str() : "hidden",
+ s.getState());
+ assert(false);
+ }
+
+ // Keep current_key_ in sync.
+ if (0 == timestamp_size_) {
+ current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+ } else {
+ Slice ts = ikey_.GetTimestamp(timestamp_size_);
+ current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type, &ts);
+ }
+ key_ = current_key_.GetInternalKey();
+ ikey_.user_key = current_key_.GetUserKey();
+ validity_info_.SetValid(ValidContext::kMerge1);
+ } else {
+ // We consumed all pinned merge operands, release pinned iterators
+ pinned_iters_mgr_.ReleasePinnedData();
+ // MergeHelper moves the iterator to the first record after the merged
+ // records, so even though we reached the end of the merge output, we do
+ // not want to advance the iterator.
+ NextFromInput();
+ }
+ } else {
+ // Only advance the input iterator if there is no merge output and the
+ // iterator is not already at the next record.
+ if (!at_next_) {
+ AdvanceInputIter();
+ }
+ NextFromInput();
+ }
+
+ if (Valid()) {
+ // Record that we've outputted a record for the current key.
+ has_outputted_key_ = true;
+ }
+
+ PrepareOutput();
+}
+
+bool CompactionIterator::InvokeFilterIfNeeded(bool* need_skip,
+ Slice* skip_until) {
+ // TODO: support compaction filter for wide-column entities
+ if (!compaction_filter_ ||
+ (ikey_.type != kTypeValue && ikey_.type != kTypeBlobIndex)) {
+ return true;
+ }
+ bool error = false;
+ // If the user has specified a compaction filter and the sequence
+ // number is greater than any external snapshot, then invoke the
+ // filter. If the return value of the compaction filter is true,
+ // replace the entry with a deletion marker.
+ CompactionFilter::Decision filter = CompactionFilter::Decision::kUndetermined;
+ compaction_filter_value_.clear();
+ compaction_filter_skip_until_.Clear();
+ CompactionFilter::ValueType value_type =
+ ikey_.type == kTypeValue ? CompactionFilter::ValueType::kValue
+ : CompactionFilter::ValueType::kBlobIndex;
+ // Hack: pass internal key to BlobIndexCompactionFilter since it needs
+ // to get sequence number.
+ assert(compaction_filter_);
+ Slice& filter_key =
+ (ikey_.type == kTypeValue ||
+ !compaction_filter_->IsStackedBlobDbInternalCompactionFilter())
+ ? ikey_.user_key
+ : key_;
+ {
+ StopWatchNano timer(clock_, report_detailed_time_);
+ if (kTypeBlobIndex == ikey_.type) {
+ filter = compaction_filter_->FilterBlobByKey(
+ level_, filter_key, &compaction_filter_value_,
+ compaction_filter_skip_until_.rep());
+ if (CompactionFilter::Decision::kUndetermined == filter &&
+ !compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) {
+ if (compaction_ == nullptr) {
+ status_ =
+ Status::Corruption("Unexpected blob index outside of compaction");
+ validity_info_.Invalidate();
+ return false;
+ }
+
+ TEST_SYNC_POINT_CALLBACK(
+ "CompactionIterator::InvokeFilterIfNeeded::TamperWithBlobIndex",
+ &value_);
+
+ // For integrated BlobDB impl, CompactionIterator reads blob value.
+ // For Stacked BlobDB impl, the corresponding CompactionFilter's
+ // FilterV2 method should read the blob value.
+ BlobIndex blob_index;
+ Status s = blob_index.DecodeFrom(value_);
+ if (!s.ok()) {
+ status_ = s;
+ validity_info_.Invalidate();
+ return false;
+ }
+
+ FilePrefetchBuffer* prefetch_buffer =
+ prefetch_buffers_ ? prefetch_buffers_->GetOrCreatePrefetchBuffer(
+ blob_index.file_number())
+ : nullptr;
+
+ uint64_t bytes_read = 0;
+
+ assert(blob_fetcher_);
+
+ s = blob_fetcher_->FetchBlob(ikey_.user_key, blob_index,
+ prefetch_buffer, &blob_value_,
+ &bytes_read);
+ if (!s.ok()) {
+ status_ = s;
+ validity_info_.Invalidate();
+ return false;
+ }
+
+ ++iter_stats_.num_blobs_read;
+ iter_stats_.total_blob_bytes_read += bytes_read;
+
+ value_type = CompactionFilter::ValueType::kValue;
+ }
+ }
+ if (CompactionFilter::Decision::kUndetermined == filter) {
+ filter = compaction_filter_->FilterV2(
+ level_, filter_key, value_type,
+ blob_value_.empty() ? value_ : blob_value_, &compaction_filter_value_,
+ compaction_filter_skip_until_.rep());
+ }
+ iter_stats_.total_filter_time +=
+ env_ != nullptr && report_detailed_time_ ? timer.ElapsedNanos() : 0;
+ }
+
+ if (CompactionFilter::Decision::kUndetermined == filter) {
+ // Should not reach here, since FilterV2 should never return kUndetermined.
+ status_ =
+ Status::NotSupported("FilterV2() should never return kUndetermined");
+ validity_info_.Invalidate();
+ return false;
+ }
+
+ if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil &&
+ cmp_->Compare(*compaction_filter_skip_until_.rep(), ikey_.user_key) <=
+ 0) {
+ // Can't skip to a key smaller than the current one.
+ // Keep the key as per FilterV2 documentation.
+ filter = CompactionFilter::Decision::kKeep;
+ }
+
+ if (filter == CompactionFilter::Decision::kRemove) {
+ // convert the current key to a delete; key_ is pointing into
+ // current_key_ at this point, so updating current_key_ updates key()
+ ikey_.type = kTypeDeletion;
+ current_key_.UpdateInternalKey(ikey_.sequence, kTypeDeletion);
+ // no value associated with delete
+ value_.clear();
+ iter_stats_.num_record_drop_user++;
+ } else if (filter == CompactionFilter::Decision::kPurge) {
+ // convert the current key to a single delete; key_ is pointing into
+ // current_key_ at this point, so updating current_key_ updates key()
+ ikey_.type = kTypeSingleDeletion;
+ current_key_.UpdateInternalKey(ikey_.sequence, kTypeSingleDeletion);
+ // no value associated with single delete
+ value_.clear();
+ iter_stats_.num_record_drop_user++;
+ } else if (filter == CompactionFilter::Decision::kChangeValue) {
+ if (ikey_.type == kTypeBlobIndex) {
+ // value transfer from blob file to inlined data
+ ikey_.type = kTypeValue;
+ current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+ }
+ value_ = compaction_filter_value_;
+ } else if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil) {
+ *need_skip = true;
+ compaction_filter_skip_until_.ConvertFromUserKey(kMaxSequenceNumber,
+ kValueTypeForSeek);
+ *skip_until = compaction_filter_skip_until_.Encode();
+ } else if (filter == CompactionFilter::Decision::kChangeBlobIndex) {
+ // Only the StackableDB-based BlobDB impl's compaction filter should return
+ // kChangeBlobIndex. Decision about rewriting blob and changing blob index
+ // in the integrated BlobDB impl is made in subsequent call to
+ // PrepareOutput() and its callees.
+ if (!compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) {
+ status_ = Status::NotSupported(
+ "Only stacked BlobDB's internal compaction filter can return "
+ "kChangeBlobIndex.");
+ validity_info_.Invalidate();
+ return false;
+ }
+ if (ikey_.type == kTypeValue) {
+ // value transfer from inlined data to blob file
+ ikey_.type = kTypeBlobIndex;
+ current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+ }
+ value_ = compaction_filter_value_;
+ } else if (filter == CompactionFilter::Decision::kIOError) {
+ if (!compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) {
+ status_ = Status::NotSupported(
+ "CompactionFilter for integrated BlobDB should not return kIOError");
+ validity_info_.Invalidate();
+ return false;
+ }
+ status_ = Status::IOError("Failed to access blob during compaction filter");
+ error = true;
+ }
+ return !error;
+}
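+// For illustration only (hypothetical user code, not part of this file): a
+// minimal CompactionFilter whose decisions are consumed by
+// InvokeFilterIfNeeded() above. IsExpired() is an assumed helper.
+//
+//   class DropExpiredFilter : public CompactionFilter {
+//    public:
+//     Decision FilterV2(int /*level*/, const Slice& /*key*/,
+//                       ValueType /*value_type*/,
+//                       const Slice& existing_value,
+//                       std::string* /*new_value*/,
+//                       std::string* /*skip_until*/) const override {
+//       // Returning kRemove converts the entry into a deletion marker above.
+//       return IsExpired(existing_value) ? Decision::kRemove
+//                                        : Decision::kKeep;
+//     }
+//     const char* Name() const override { return "DropExpiredFilter"; }
+//   };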
+
+void CompactionIterator::NextFromInput() {
+ at_next_ = false;
+ validity_info_.Invalidate();
+
+ while (!Valid() && input_.Valid() && !IsPausingManualCompaction() &&
+ !IsShuttingDown()) {
+ key_ = input_.key();
+ value_ = input_.value();
+ blob_value_.Reset();
+ iter_stats_.num_input_records++;
+
+ Status pik_status = ParseInternalKey(key_, &ikey_, allow_data_in_errors_);
+ if (!pik_status.ok()) {
+ iter_stats_.num_input_corrupt_records++;
+
+ // If `expect_valid_internal_key_` is false, return the corrupted key
+ // and let the caller decide what to do with it.
+ if (expect_valid_internal_key_) {
+ status_ = pik_status;
+ return;
+ }
+ key_ = current_key_.SetInternalKey(key_);
+ has_current_user_key_ = false;
+ current_user_key_sequence_ = kMaxSequenceNumber;
+ current_user_key_snapshot_ = 0;
+ validity_info_.SetValid(ValidContext::kParseKeyError);
+ break;
+ }
+ TEST_SYNC_POINT_CALLBACK("CompactionIterator:ProcessKV", &ikey_);
+
+ // Update input statistics
+ if (ikey_.type == kTypeDeletion || ikey_.type == kTypeSingleDeletion ||
+ ikey_.type == kTypeDeletionWithTimestamp) {
+ iter_stats_.num_input_deletion_records++;
+ }
+ iter_stats_.total_input_raw_key_bytes += key_.size();
+ iter_stats_.total_input_raw_value_bytes += value_.size();
+
+ // If need_skip is true, we should seek the input iterator
+ // to internal key skip_until and continue from there.
+ bool need_skip = false;
+ // Points either into compaction_filter_skip_until_ or into
+ // merge_helper_->compaction_filter_skip_until_.
+ Slice skip_until;
+
+ bool user_key_equal_without_ts = false;
+ int cmp_ts = 0;
+ if (has_current_user_key_) {
+ user_key_equal_without_ts =
+ cmp_->EqualWithoutTimestamp(ikey_.user_key, current_user_key_);
+ // if timestamp_size_ > 0, then curr_ts_ has been initialized by a
+ // previous key.
+ cmp_ts = timestamp_size_ ? cmp_->CompareTimestamp(
+ ExtractTimestampFromUserKey(
+ ikey_.user_key, timestamp_size_),
+ curr_ts_)
+ : 0;
+ }
+
+ // Check whether the user key changed. After this if statement current_key_
+ // is a copy of the current input key (maybe converted to a delete by the
+ // compaction filter). ikey_.user_key is pointing to the copy.
+ if (!has_current_user_key_ || !user_key_equal_without_ts || cmp_ts != 0) {
+ // First occurrence of this user key
+ // Copy key for output
+ key_ = current_key_.SetInternalKey(key_, &ikey_);
+
+ int prev_cmp_with_ts_low =
+ !full_history_ts_low_ ? 0
+ : curr_ts_.empty()
+ ? 0
+ : cmp_->CompareTimestamp(curr_ts_, *full_history_ts_low_);
+
+ // If timestamp_size_ > 0, then copy from ikey_ to curr_ts_ for the use
+ // in next iteration to compare with the timestamp of next key.
+ UpdateTimestampAndCompareWithFullHistoryLow();
+
+ // If
+ // (1) !has_current_user_key_, OR
+ // (2) timestamp is disabled, OR
+ // (3) all history will be preserved, OR
+ // (4) user key (excluding timestamp) is different from previous key, OR
+ // (5) timestamp is NO older than *full_history_ts_low_, OR
+ // (6) timestamp is the largest one older than full_history_ts_low_,
+ // then current_user_key_ must be treated as a different user key.
+ // This means, if a user key (excluding ts) is the same as the previous
+ // user key, and its ts is older than *full_history_ts_low_, then we
+ // consider this key for GC, e.g. it may be dropped if certain conditions
+ // match.
+ if (!has_current_user_key_ || !timestamp_size_ || !full_history_ts_low_ ||
+ !user_key_equal_without_ts || cmp_with_history_ts_low_ >= 0 ||
+ prev_cmp_with_ts_low >= 0) {
+ // Initialize for future comparison for rule (A) and etc.
+ current_user_key_sequence_ = kMaxSequenceNumber;
+ current_user_key_snapshot_ = 0;
+ has_current_user_key_ = true;
+ }
+ current_user_key_ = ikey_.user_key;
+
+ has_outputted_key_ = false;
+
+ last_key_seq_zeroed_ = false;
+
+ current_key_committed_ = KeyCommitted(ikey_.sequence);
+
+ // Apply the compaction filter to the first committed version of the user
+ // key.
+ if (current_key_committed_ &&
+ !InvokeFilterIfNeeded(&need_skip, &skip_until)) {
+ break;
+ }
+ } else {
+ // Update the current key to reflect the new sequence number/type without
+ // copying the user key.
+ // TODO(rven): Compaction filter does not process keys in this path
+ // Need to have the compaction filter process multiple versions
+ // if we have versions on both sides of a snapshot
+ current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+ key_ = current_key_.GetInternalKey();
+ ikey_.user_key = current_key_.GetUserKey();
+
+ // Note that a newer version of a key is ordered before older versions. If a
+ // newer version of a key is committed, so are the older versions. No need
+ // to query snapshot_checker_ in that case.
+ if (UNLIKELY(!current_key_committed_)) {
+ assert(snapshot_checker_ != nullptr);
+ current_key_committed_ = KeyCommitted(ikey_.sequence);
+ // Apply the compaction filter to the first committed version of the
+ // user key.
+ if (current_key_committed_ &&
+ !InvokeFilterIfNeeded(&need_skip, &skip_until)) {
+ break;
+ }
+ }
+ }
+
+ if (UNLIKELY(!current_key_committed_)) {
+ assert(snapshot_checker_ != nullptr);
+ validity_info_.SetValid(ValidContext::kCurrentKeyUncommitted);
+ break;
+ }
+
+ // If there are no snapshots, then this kv affects visibility at tip.
+ // Otherwise, search through all existing snapshots to find the earliest
+ // snapshot that is affected by this kv.
+ SequenceNumber last_sequence = current_user_key_sequence_;
+ current_user_key_sequence_ = ikey_.sequence;
+ SequenceNumber last_snapshot = current_user_key_snapshot_;
+ SequenceNumber prev_snapshot = 0; // 0 means no previous snapshot
+ current_user_key_snapshot_ =
+ visible_at_tip_
+ ? earliest_snapshot_
+ : findEarliestVisibleSnapshot(ikey_.sequence, &prev_snapshot);
+
+ if (need_skip) {
+ // This case is handled below.
+ } else if (clear_and_output_next_key_) {
+ // In the previous iteration we encountered a single delete that we could
+ // not compact out. We will keep this Put, but can drop its data.
+ // (See Optimization 3, below.)
+ if (ikey_.type != kTypeValue && ikey_.type != kTypeBlobIndex &&
+ ikey_.type != kTypeWideColumnEntity) {
+ ROCKS_LOG_FATAL(info_log_, "Unexpected key %s for compaction output",
+ ikey_.DebugString(allow_data_in_errors_, true).c_str());
+ assert(false);
+ }
+ if (current_user_key_snapshot_ < last_snapshot) {
+ ROCKS_LOG_FATAL(info_log_,
+ "key %s, current_user_key_snapshot_ (%" PRIu64
+ ") < last_snapshot (%" PRIu64 ")",
+ ikey_.DebugString(allow_data_in_errors_, true).c_str(),
+ current_user_key_snapshot_, last_snapshot);
+ assert(false);
+ }
+
+ if (ikey_.type == kTypeBlobIndex || ikey_.type == kTypeWideColumnEntity) {
+ ikey_.type = kTypeValue;
+ current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+ }
+
+ value_.clear();
+ validity_info_.SetValid(ValidContext::kKeepSDAndClearPut);
+ clear_and_output_next_key_ = false;
+ } else if (ikey_.type == kTypeSingleDeletion) {
+ // We can compact out a SingleDelete if:
+ // 1) We encounter the corresponding PUT -OR- we know that this key
+ // doesn't appear past this output level
+ // =AND=
+ // 2) We've already returned a record in this snapshot -OR-
+ // there is no earlier earliest_write_conflict_snapshot.
+ //
+ // A note about 2) above:
+ // we try to determine whether there is any earlier write conflict
+ // checking snapshot by calling DefinitelyInSnapshot() with seq and
+ // earliest_write_conflict_snapshot as arguments. For write-prepared
+ // and write-unprepared transactions, if earliest_write_conflict_snapshot
+ // is evicted from WritePreparedTxnDB::commit_cache, then
+ // DefinitelyInSnapshot(seq, earliest_write_conflict_snapshot) returns
+ // false, even if the seq is actually visible within
+ // earliest_write_conflict_snapshot. Consequently, CompactionIterator
+ // may try to zero out its sequence number, thus hitting assertion error
+ // in debug mode or cause incorrect DBIter return result.
+ // We observe that earliest_write_conflict_snapshot >= earliest_snapshot,
+ // and the seq zeroing logic depends on
+ // DefinitelyInSnapshot(seq, earliest_snapshot). Therefore, if we cannot
+ // determine whether seq is **definitely** in
+ // earliest_write_conflict_snapshot, then we can additionally check if
+ // seq is definitely in earliest_snapshot. If the latter holds, then the
+ // former holds too.
+ //
+ // Rule 1 is needed for SingleDelete correctness. Rule 2 is needed to
+ // allow Transactions to do write-conflict checking (if we compacted away
+ // all keys, then we wouldn't know that a write happened in this
+ // snapshot). If there is no earlier snapshot, then we know that there
+ // are no active transactions that need to know about any writes.
+ //
+ // Optimization 3:
+ // If we encounter a SingleDelete followed by a PUT and Rule 2 is NOT
+ // true, then we must output a SingleDelete. In this case, we will decide
+ // to also output the PUT. While we are compacting less by outputting the
+ // PUT now, hopefully this will lead to better compaction in the future
+ // when Rule 2 is later true (Ie, We are hoping we can later compact out
+ // both the SingleDelete and the Put, while we couldn't if we only
+ // outputted the SingleDelete now).
+ // In this case, we can save space by removing the PUT's value as it will
+ // never be read.
+ //
+ // Deletes and Merges are not supported on the same key that has a
+ // SingleDelete as it is not possible to correctly do any partial
+ // compaction of such a combination of operations. The result of mixing
+ // those operations for a given key is documented as being undefined. So
+ // we can choose how to handle such combinations of operations. We will
+ // try to compact out as much as we can in these cases.
+ // We will report counts on these anomalous cases.
+ //
+ // Note: If timestamp is enabled, the record is eligible for deletion only
+ // if, along with the above conditions (Rule 1 and Rule 2),
+ // full_history_ts_low_ is specified and the timestamp for the key is less
+ // than *full_history_ts_low_. If it's not eligible for deletion, we
+ // output the SingleDelete. Likewise, Optimization 3 is applied only if
+ // full_history_ts_low_ is specified and the timestamp for the key is less
+ // than *full_history_ts_low_.
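+ // Illustrative example (not from the original source): suppose the input
+ // contains SingleDelete(k)@5 immediately followed by Put(k)@3 in the same
+ // snapshot stripe. If Rule 2 holds (e.g. there is no write-conflict
+ // snapshot earlier than seq 5), both records are dropped. If Rule 2 does
+ // not hold, the SingleDelete is output and the Put is output with its
+ // value cleared (Optimization 3).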
+
+ // The easiest way to process a SingleDelete during iteration is to peek
+ // ahead at the next key.
+ const bool is_timestamp_eligible_for_gc =
+ (timestamp_size_ == 0 ||
+ (full_history_ts_low_ && cmp_with_history_ts_low_ < 0));
+
+ ParsedInternalKey next_ikey;
+ AdvanceInputIter();
+
+ // Check whether the next key exists, is not corrupt, and is the same key
+ // as the single delete.
+ if (input_.Valid() &&
+ ParseInternalKey(input_.key(), &next_ikey, allow_data_in_errors_)
+ .ok() &&
+ cmp_->EqualWithoutTimestamp(ikey_.user_key, next_ikey.user_key)) {
+#ifndef NDEBUG
+ const Compaction* c =
+ compaction_ ? compaction_->real_compaction() : nullptr;
+#endif
+ TEST_SYNC_POINT_CALLBACK(
+ "CompactionIterator::NextFromInput:SingleDelete:1",
+ const_cast<Compaction*>(c));
+ if (last_key_seq_zeroed_) {
+ ++iter_stats_.num_record_drop_hidden;
+ ++iter_stats_.num_record_drop_obsolete;
+ assert(bottommost_level_);
+ AdvanceInputIter();
+ } else if (prev_snapshot == 0 ||
+ DefinitelyNotInSnapshot(next_ikey.sequence, prev_snapshot)) {
+ // Check whether the next key belongs to the same snapshot as the
+ // SingleDelete.
+
+ TEST_SYNC_POINT_CALLBACK(
+ "CompactionIterator::NextFromInput:SingleDelete:2", nullptr);
+ if (next_ikey.type == kTypeSingleDeletion) {
+ // We encountered two SingleDeletes for same key in a row. This
+ // could be due to unexpected user input. If write-(un)prepared
+ // transaction is used, this could also be due to releasing an old
+ // snapshot between a Put and its matching SingleDelete.
+ // Skip the first SingleDelete and let the next iteration decide
+ // how to handle the second SingleDelete.
+
+ // First SingleDelete has been skipped since we already called
+ // input_.Next().
+ ++iter_stats_.num_record_drop_obsolete;
+ ++iter_stats_.num_single_del_mismatch;
+ } else if (next_ikey.type == kTypeDeletion) {
+ std::ostringstream oss;
+ oss << "Found SD and type: " << static_cast<int>(next_ikey.type)
+ << " on the same key, violating the contract "
+ "of SingleDelete. Check your application to make sure the "
+ "application does not mix SingleDelete and Delete for "
+ "the same key. If you are using "
+ "write-prepared/write-unprepared transactions, and use "
+ "SingleDelete to delete certain keys, then make sure "
+ "TransactionDBOptions::rollback_deletion_type_callback is "
+ "configured properly. Mixing SD and DEL can lead to "
+ "undefined behaviors";
+ ++iter_stats_.num_record_drop_obsolete;
+ ++iter_stats_.num_single_del_mismatch;
+ if (enforce_single_del_contracts_) {
+ ROCKS_LOG_ERROR(info_log_, "%s", oss.str().c_str());
+ validity_info_.Invalidate();
+ status_ = Status::Corruption(oss.str());
+ return;
+ }
+ ROCKS_LOG_WARN(info_log_, "%s", oss.str().c_str());
+ } else if (!is_timestamp_eligible_for_gc) {
+ // We cannot drop the SingleDelete as timestamp is enabled, and
+ // timestamp of this key is greater than or equal to
+ // *full_history_ts_low_. We will output the SingleDelete.
+ validity_info_.SetValid(ValidContext::kKeepTsHistory);
+ } else if (has_outputted_key_ ||
+ DefinitelyInSnapshot(ikey_.sequence,
+ earliest_write_conflict_snapshot_) ||
+ (earliest_snapshot_ < earliest_write_conflict_snapshot_ &&
+ DefinitelyInSnapshot(ikey_.sequence,
+ earliest_snapshot_))) {
+ // Found a matching value, we can drop the single delete and the
+ // value. It is safe to drop both records since we've already
+ // outputted a key in this snapshot, or there is no earlier
+ // snapshot (Rule 2 above).
+
+ // Note: it doesn't matter whether the second key is a Put or if it
+ // is an unexpected Merge or Delete. We will compact it out
+ // either way. We will maintain counts of how many mismatches
+ // happened
+ if (next_ikey.type != kTypeValue &&
+ next_ikey.type != kTypeBlobIndex &&
+ next_ikey.type != kTypeWideColumnEntity) {
+ ++iter_stats_.num_single_del_mismatch;
+ }
+
+ ++iter_stats_.num_record_drop_hidden;
+ ++iter_stats_.num_record_drop_obsolete;
+ // Already called input_.Next() once. Call it a second time to
+ // skip past the second key.
+ AdvanceInputIter();
+ } else {
+ // Found a matching value, but we cannot drop both keys since
+ // there is an earlier snapshot and we need to leave behind a record
+ // to know that a write happened in this snapshot (Rule 2 above).
+ // Clear the value and output the SingleDelete. (The value will be
+ // outputted on the next iteration.)
+
+ // Setting valid_ to true will output the current SingleDelete
+ validity_info_.SetValid(ValidContext::kKeepSDForConflictCheck);
+
+ // Set up the Put to be outputted in the next iteration.
+ // (Optimization 3).
+ clear_and_output_next_key_ = true;
+ TEST_SYNC_POINT_CALLBACK(
+ "CompactionIterator::NextFromInput:KeepSDForWW",
+ /*arg=*/nullptr);
+ }
+ } else {
+ // We hit the next snapshot without hitting a put, so the iterator
+ // returns the single delete.
+ validity_info_.SetValid(ValidContext::kKeepSDForSnapshot);
+ TEST_SYNC_POINT_CALLBACK(
+ "CompactionIterator::NextFromInput:SingleDelete:3",
+ const_cast<Compaction*>(c));
+ }
+ } else {
+ // We are at the end of the input, could not parse the next key, or hit
+ // a different key. The iterator returns the single delete if the key
+ // possibly exists beyond the current output level. We set
+ // has_current_user_key to false so that if the iterator is at the next
+ // key, we do not compare it again against the previous key at the next
+ // iteration. If the next key is corrupt, we return before the
+ // comparison, so the value of has_current_user_key does not matter.
+ has_current_user_key_ = false;
+ if (compaction_ != nullptr &&
+ DefinitelyInSnapshot(ikey_.sequence, earliest_snapshot_) &&
+ compaction_->KeyNotExistsBeyondOutputLevel(ikey_.user_key,
+ &level_ptrs_) &&
+ is_timestamp_eligible_for_gc) {
+ // Key doesn't exist outside of this range.
+ // Can compact out this SingleDelete.
+ ++iter_stats_.num_record_drop_obsolete;
+ ++iter_stats_.num_single_del_fallthru;
+ if (!bottommost_level_) {
+ ++iter_stats_.num_optimized_del_drop_obsolete;
+ }
+ } else if (last_key_seq_zeroed_) {
+ // Skip.
+ ++iter_stats_.num_record_drop_hidden;
+ ++iter_stats_.num_record_drop_obsolete;
+ assert(bottommost_level_);
+ } else {
+ // Output SingleDelete
+ validity_info_.SetValid(ValidContext::kKeepSD);
+ }
+ }
+
+ if (Valid()) {
+ at_next_ = true;
+ }
+ } else if (last_snapshot == current_user_key_snapshot_ ||
+ (last_snapshot > 0 &&
+ last_snapshot < current_user_key_snapshot_)) {
+ // If the earliest snapshot in which this key is visible is the same as
+ // the earliest visible snapshot of a previous instance of the same user
+ // key, then this kv is not visible in any snapshot: it is hidden by a
+ // newer entry for the same user key.
+ //
+ // Note: Dropping this key will not affect TransactionDB write-conflict
+ // checking since there has already been a record returned for this key
+ // in this snapshot.
+ if (last_sequence < current_user_key_sequence_) {
+ ROCKS_LOG_FATAL(info_log_,
+ "key %s, last_sequence (%" PRIu64
+ ") < current_user_key_sequence_ (%" PRIu64 ")",
+ ikey_.DebugString(allow_data_in_errors_, true).c_str(),
+ last_sequence, current_user_key_sequence_);
+ assert(false);
+ }
+
+ ++iter_stats_.num_record_drop_hidden; // rule (A)
+ AdvanceInputIter();
+ } else if (compaction_ != nullptr &&
+ (ikey_.type == kTypeDeletion ||
+ (ikey_.type == kTypeDeletionWithTimestamp &&
+ cmp_with_history_ts_low_ < 0)) &&
+ DefinitelyInSnapshot(ikey_.sequence, earliest_snapshot_) &&
+ compaction_->KeyNotExistsBeyondOutputLevel(ikey_.user_key,
+ &level_ptrs_)) {
+ // TODO(noetzli): This is the only place where we use compaction_
+ // (besides the constructor). We should probably get rid of this
+ // dependency and find a way to do similar filtering during flushes.
+ //
+ // For this user key:
+ // (1) there is no data in higher levels
+ // (2) data in lower levels will have larger sequence numbers
+ // (3) data in layers that are being compacted here and have
+ // smaller sequence numbers will be dropped in the next
+ // few iterations of this loop (by rule (A) above).
+ // Therefore this deletion marker is obsolete and can be dropped.
+ //
+ // Note: Dropping this Delete will not affect TransactionDB
+ // write-conflict checking since it is earlier than any snapshot.
+ //
+ // It seems that we could also drop deletions later than the earliest
+ // snapshot, given that:
+ // (1) The deletion is earlier than earliest_write_conflict_snapshot, and
+ // (2) No value exists earlier than the deletion.
+ //
+ // Note also that a deletion marker of type kTypeDeletionWithTimestamp
+ // will be treated as a different user key unless the timestamp is older
+ // than *full_history_ts_low_.
+ ++iter_stats_.num_record_drop_obsolete;
+ if (!bottommost_level_) {
+ ++iter_stats_.num_optimized_del_drop_obsolete;
+ }
+ AdvanceInputIter();
+ } else if ((ikey_.type == kTypeDeletion ||
+ (ikey_.type == kTypeDeletionWithTimestamp &&
+ cmp_with_history_ts_low_ < 0)) &&
+ bottommost_level_) {
+ // Handle the case where we have a delete key at the bottommost level.
+ // We can skip outputting the key iff there are no subsequent puts for
+ // this key.
+ assert(!compaction_ || compaction_->KeyNotExistsBeyondOutputLevel(
+ ikey_.user_key, &level_ptrs_));
+ ParsedInternalKey next_ikey;
+ AdvanceInputIter();
+#ifndef NDEBUG
+ const Compaction* c =
+ compaction_ ? compaction_->real_compaction() : nullptr;
+#endif
+ TEST_SYNC_POINT_CALLBACK(
+ "CompactionIterator::NextFromInput:BottommostDelete:1",
+ const_cast<Compaction*>(c));
+ // Skip over all versions of this key that happen to occur in the same
+ // snapshot range as the delete.
+ //
+ // Note that a deletion marker of type kTypeDeletionWithTimestamp will be
+ // considered to have a different user key unless the timestamp is older
+ // than *full_history_ts_low_.
+ while (!IsPausingManualCompaction() && !IsShuttingDown() &&
+ input_.Valid() &&
+ (ParseInternalKey(input_.key(), &next_ikey, allow_data_in_errors_)
+ .ok()) &&
+ cmp_->EqualWithoutTimestamp(ikey_.user_key, next_ikey.user_key) &&
+ (prev_snapshot == 0 ||
+ DefinitelyNotInSnapshot(next_ikey.sequence, prev_snapshot))) {
+ AdvanceInputIter();
+ }
+ // If we still need to output a row with this key, we need to
+ // output the delete too.
+ if (input_.Valid() &&
+ (ParseInternalKey(input_.key(), &next_ikey, allow_data_in_errors_)
+ .ok()) &&
+ cmp_->EqualWithoutTimestamp(ikey_.user_key, next_ikey.user_key)) {
+ validity_info_.SetValid(ValidContext::kKeepDel);
+ at_next_ = true;
+ }
+ } else if (ikey_.type == kTypeMerge) {
+ if (!merge_helper_->HasOperator()) {
+ status_ = Status::InvalidArgument(
+ "merge_operator is not properly initialized.");
+ return;
+ }
+
+ pinned_iters_mgr_.StartPinning();
+
+ // We know the merge type entry is not hidden, otherwise we would
+ // have hit (A)
+ // We encapsulate the merge related state machine in a different
+ // object to minimize change to the existing flow.
+ Status s = merge_helper_->MergeUntil(
+ &input_, range_del_agg_, prev_snapshot, bottommost_level_,
+ allow_data_in_errors_, blob_fetcher_.get(), full_history_ts_low_,
+ prefetch_buffers_.get(), &iter_stats_);
+ merge_out_iter_.SeekToFirst();
+
+ if (!s.ok() && !s.IsMergeInProgress()) {
+ status_ = s;
+ return;
+ } else if (merge_out_iter_.Valid()) {
+ // NOTE: key, value, and ikey_ refer to old entries.
+ // These will be correctly set below.
+ key_ = merge_out_iter_.key();
+ value_ = merge_out_iter_.value();
+ pik_status = ParseInternalKey(key_, &ikey_, allow_data_in_errors_);
+ // MergeUntil stops when it encounters a corrupt key and does not
+ // include them in the result, so we expect the keys here to be valid.
+ if (!pik_status.ok()) {
+ ROCKS_LOG_FATAL(
+ info_log_, "Invalid key %s in compaction. %s",
+ allow_data_in_errors_ ? key_.ToString(true).c_str() : "hidden",
+ pik_status.getState());
+ assert(false);
+ }
+ // Keep current_key_ in sync.
+ current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+ key_ = current_key_.GetInternalKey();
+ ikey_.user_key = current_key_.GetUserKey();
+ validity_info_.SetValid(ValidContext::kMerge2);
+ } else {
+ // All merge operands were filtered out. Reset the user key, since the
+ // batch consumed by the merge operator should not shadow any keys
+ // coming after the merges.
+ has_current_user_key_ = false;
+ pinned_iters_mgr_.ReleasePinnedData();
+
+ if (merge_helper_->FilteredUntil(&skip_until)) {
+ need_skip = true;
+ }
+ }
+ } else {
+ // 1. new user key -OR-
+ // 2. different snapshot stripe
+ // If user-defined timestamp is enabled, we consider keys for GC if they
+ // are below history_ts_low_. CompactionRangeDelAggregator::ShouldDelete()
+ // only considers range deletions that are at or below history_ts_low_ and
+ // trim_ts_. We drop keys here that are below history_ts_low_ and are
+ // covered by a range tombstone that is at or below history_ts_low_ and
+ // trim_ts.
+ bool should_delete = false;
+ if (!timestamp_size_ || cmp_with_history_ts_low_ < 0) {
+ should_delete = range_del_agg_->ShouldDelete(
+ key_, RangeDelPositioningMode::kForwardTraversal);
+ }
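+ // For illustration (assumed example): a Put(k)@5 covered by a range
+ // tombstone [a, z)@8 that is visible in the same snapshot stripe would,
+ // roughly, be dropped by the branch below as a range-deletion casualty.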
+ if (should_delete) {
+ ++iter_stats_.num_record_drop_hidden;
+ ++iter_stats_.num_record_drop_range_del;
+ AdvanceInputIter();
+ } else {
+ validity_info_.SetValid(ValidContext::kNewUserKey);
+ }
+ }
+
+ if (need_skip) {
+ SkipUntil(skip_until);
+ }
+ }
+
+ if (!Valid() && IsShuttingDown()) {
+ status_ = Status::ShutdownInProgress();
+ }
+
+ if (IsPausingManualCompaction()) {
+ status_ = Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+ }
+
+ // Propagate corruption status from memtable iterator
+ if (!input_.Valid() && input_.status().IsCorruption()) {
+ status_ = input_.status();
+ }
+}
+
+bool CompactionIterator::ExtractLargeValueIfNeededImpl() {
+ if (!blob_file_builder_) {
+ return false;
+ }
+
+ blob_index_.clear();
+ const Status s = blob_file_builder_->Add(user_key(), value_, &blob_index_);
+
+ if (!s.ok()) {
+ status_ = s;
+ validity_info_.Invalidate();
+
+ return false;
+ }
+
+ if (blob_index_.empty()) {
+ return false;
+ }
+
+ value_ = blob_index_;
+
+ return true;
+}
+
+void CompactionIterator::ExtractLargeValueIfNeeded() {
+ assert(ikey_.type == kTypeValue);
+
+ if (!ExtractLargeValueIfNeededImpl()) {
+ return;
+ }
+
+ ikey_.type = kTypeBlobIndex;
+ current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+}
+
+void CompactionIterator::GarbageCollectBlobIfNeeded() {
+ assert(ikey_.type == kTypeBlobIndex);
+
+ if (!compaction_) {
+ return;
+ }
+
+ // GC for integrated BlobDB
+ if (compaction_->enable_blob_garbage_collection()) {
+ TEST_SYNC_POINT_CALLBACK(
+ "CompactionIterator::GarbageCollectBlobIfNeeded::TamperWithBlobIndex",
+ &value_);
+
+ BlobIndex blob_index;
+
+ {
+ const Status s = blob_index.DecodeFrom(value_);
+
+ if (!s.ok()) {
+ status_ = s;
+ validity_info_.Invalidate();
+
+ return;
+ }
+ }
+
+ if (blob_index.file_number() >=
+ blob_garbage_collection_cutoff_file_number_) {
+ return;
+ }
+
+ FilePrefetchBuffer* prefetch_buffer =
+ prefetch_buffers_ ? prefetch_buffers_->GetOrCreatePrefetchBuffer(
+ blob_index.file_number())
+ : nullptr;
+
+ uint64_t bytes_read = 0;
+
+ {
+ assert(blob_fetcher_);
+
+ const Status s = blob_fetcher_->FetchBlob(
+ user_key(), blob_index, prefetch_buffer, &blob_value_, &bytes_read);
+
+ if (!s.ok()) {
+ status_ = s;
+ validity_info_.Invalidate();
+
+ return;
+ }
+ }
+
+ ++iter_stats_.num_blobs_read;
+ iter_stats_.total_blob_bytes_read += bytes_read;
+
+ ++iter_stats_.num_blobs_relocated;
+ iter_stats_.total_blob_bytes_relocated += blob_index.size();
+
+ value_ = blob_value_;
+
+ if (ExtractLargeValueIfNeededImpl()) {
+ return;
+ }
+
+ ikey_.type = kTypeValue;
+ current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+
+ return;
+ }
+
+ // GC for stacked BlobDB
+ if (compaction_filter_ &&
+ compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) {
+ const auto blob_decision = compaction_filter_->PrepareBlobOutput(
+ user_key(), value_, &compaction_filter_value_);
+
+ if (blob_decision == CompactionFilter::BlobDecision::kCorruption) {
+ status_ =
+ Status::Corruption("Corrupted blob reference encountered during GC");
+ validity_info_.Invalidate();
+
+ return;
+ }
+
+ if (blob_decision == CompactionFilter::BlobDecision::kIOError) {
+ status_ = Status::IOError("Could not relocate blob during GC");
+ validity_info_.Invalidate();
+
+ return;
+ }
+
+ if (blob_decision == CompactionFilter::BlobDecision::kChangeValue) {
+ value_ = compaction_filter_value_;
+
+ return;
+ }
+ }
+}
+
+void CompactionIterator::DecideOutputLevel() {
+ assert(compaction_->SupportsPerKeyPlacement());
+#ifndef NDEBUG
+ // Could be overridden by unittest
+ PerKeyPlacementContext context(level_, ikey_.user_key, value_,
+ ikey_.sequence);
+ TEST_SYNC_POINT_CALLBACK("CompactionIterator::PrepareOutput.context",
+ &context);
+ output_to_penultimate_level_ = context.output_to_penultimate_level;
+#else
+ output_to_penultimate_level_ = false;
+#endif // NDEBUG
+
+ // If the key is newer than the cutoff sequence or within the earliest
+ // snapshot, it should be output to the penultimate level.
+ if (ikey_.sequence > preclude_last_level_min_seqno_ ||
+ ikey_.sequence > earliest_snapshot_) {
+ output_to_penultimate_level_ = true;
+ }
+
+ if (output_to_penultimate_level_) {
+ // If it's decided to output to the penultimate level, but unsafe to do so,
+ // still output to the last level. For example, moving the data from a lower
+ // level to a higher level outside of the higher-level input key range is
+ // considered unsafe, because the key may conflict with higher-level SSTs
+ // not from this compaction.
+ // TODO: add statistic for declined output_to_penultimate_level
+ bool safe_to_penultimate_level =
+ compaction_->WithinPenultimateLevelOutputRange(ikey_.user_key);
+ if (!safe_to_penultimate_level) {
+ output_to_penultimate_level_ = false;
+ // This can happen when `last_level_temperature` is disabled/enabled while
+ // holding a snapshot. When `last_level_temperature` is not set
+ // (== kUnknown), the data newer than any snapshot is pushed to the last
+ // level, but when the per_key_placement feature is enabled on the fly,
+ // the data later than the snapshot has to be moved to the penultimate
+ // level, which may or may not be safe. So the user needs to make sure all
+ // snapshots are released before enabling the `last_level_temperature`
+ // feature. We will migrate the feature to `last_level_temperature` and
+ // maybe make it not dynamically changeable.
+ if (ikey_.sequence > earliest_snapshot_) {
+ status_ = Status::Corruption(
+ "Unsafe to store Seq later than snapshot in the last level if "
+ "per_key_placement is enabled");
+ }
+ }
+ }
+}
+
+void CompactionIterator::PrepareOutput() {
+ if (Valid()) {
+ if (ikey_.type == kTypeValue) {
+ ExtractLargeValueIfNeeded();
+ } else if (ikey_.type == kTypeBlobIndex) {
+ GarbageCollectBlobIfNeeded();
+ }
+
+ if (compaction_ != nullptr && compaction_->SupportsPerKeyPlacement()) {
+ DecideOutputLevel();
+ }
+
+ // Zeroing out the sequence number leads to better compression.
+ // If this is the bottommost level (no files in lower levels)
+ // and the earliest snapshot is larger than this seqno
+ // and the userkey differs from the last userkey in compaction
+ // then we can squash the seqno to zero.
+ //
+ // This is safe for TransactionDB write-conflict checking since transactions
+ // only care about sequence numbers larger than any active snapshot.
+ //
+ // Can we do the same for levels above bottom level as long as
+ // KeyNotExistsBeyondOutputLevel() returns true?
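+ // Illustrative example (not from the original source): a Put(k)@7 written
+ // to the bottommost level, with no snapshot that could still observe seq 7
+ // and a user key different from its neighbors, is rewritten as Put(k)@0;
+ // runs of zero sequence numbers compress better and remain correct.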
+ if (Valid() && compaction_ != nullptr &&
+ !compaction_->allow_ingest_behind() && bottommost_level_ &&
+ DefinitelyInSnapshot(ikey_.sequence, earliest_snapshot_) &&
+ ikey_.type != kTypeMerge && current_key_committed_ &&
+ !output_to_penultimate_level_ &&
+ ikey_.sequence < preserve_time_min_seqno_) {
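+      // Deletions that satisfy the conditions above should already have been
+      // dropped earlier in the compaction, so reaching this branch is
+      // unexpected; hence the fatal log and the assert below.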
+ if (ikey_.type == kTypeDeletion ||
+ (ikey_.type == kTypeSingleDeletion && timestamp_size_ == 0)) {
+ ROCKS_LOG_FATAL(
+ info_log_,
+ "Unexpected key %s for seq-zero optimization. "
+ "earliest_snapshot %" PRIu64
+ ", earliest_write_conflict_snapshot %" PRIu64
+ " job_snapshot %" PRIu64
+ ". timestamp_size: %d full_history_ts_low_ %s. validity %x",
+ ikey_.DebugString(allow_data_in_errors_, true).c_str(),
+ earliest_snapshot_, earliest_write_conflict_snapshot_,
+ job_snapshot_, static_cast<int>(timestamp_size_),
+ full_history_ts_low_ != nullptr
+ ? Slice(*full_history_ts_low_).ToString(true).c_str()
+ : "null",
+ validity_info_.rep);
+ assert(false);
+ }
+ ikey_.sequence = 0;
+ last_key_seq_zeroed_ = true;
+ TEST_SYNC_POINT_CALLBACK("CompactionIterator::PrepareOutput:ZeroingSeq",
+ &ikey_);
+ if (!timestamp_size_) {
+ current_key_.UpdateInternalKey(0, ikey_.type);
+ } else if (full_history_ts_low_ && cmp_with_history_ts_low_ < 0) {
+ // We can also zero out timestamp for better compression.
+ // For the same user key (excluding timestamp), the timestamp-based
+ // history can be collapsed to save some space if the timestamp is
+ // older than *full_history_ts_low_.
+ const std::string kTsMin(timestamp_size_, static_cast<char>(0));
+ const Slice ts_slice = kTsMin;
+ ikey_.SetTimestamp(ts_slice);
+ current_key_.UpdateInternalKey(0, ikey_.type, &ts_slice);
+ }
+ }
+ }
+}
+
+inline SequenceNumber CompactionIterator::findEarliestVisibleSnapshot(
+ SequenceNumber in, SequenceNumber* prev_snapshot) {
+ assert(snapshots_->size());
+ if (snapshots_->size() == 0) {
+ ROCKS_LOG_FATAL(info_log_,
+ "No snapshot left in findEarliestVisibleSnapshot");
+ }
+ auto snapshots_iter =
+ std::lower_bound(snapshots_->begin(), snapshots_->end(), in);
+ assert(prev_snapshot != nullptr);
+ if (snapshots_iter == snapshots_->begin()) {
+ *prev_snapshot = 0;
+ } else {
+ *prev_snapshot = *std::prev(snapshots_iter);
+ if (*prev_snapshot >= in) {
+ ROCKS_LOG_FATAL(info_log_,
+ "*prev_snapshot (%" PRIu64 ") >= in (%" PRIu64
+ ") in findEarliestVisibleSnapshot",
+ *prev_snapshot, in);
+ assert(false);
+ }
+ }
+ if (snapshot_checker_ == nullptr) {
+ return snapshots_iter != snapshots_->end() ? *snapshots_iter
+ : kMaxSequenceNumber;
+ }
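+  // With a SnapshotChecker, a snapshot whose sequence number is >= `in` may
+  // still not see `in`, or may have been released, so walk forward from the
+  // lower bound until the checker confirms visibility.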
+ bool has_released_snapshot = !released_snapshots_.empty();
+ for (; snapshots_iter != snapshots_->end(); ++snapshots_iter) {
+ auto cur = *snapshots_iter;
+ if (in > cur) {
+ ROCKS_LOG_FATAL(info_log_,
+ "in (%" PRIu64 ") > cur (%" PRIu64
+ ") in findEarliestVisibleSnapshot",
+ in, cur);
+ assert(false);
+ }
+ // Skip if cur is in released_snapshots.
+ if (has_released_snapshot && released_snapshots_.count(cur) > 0) {
+ continue;
+ }
+ auto res = snapshot_checker_->CheckInSnapshot(in, cur);
+ if (res == SnapshotCheckerResult::kInSnapshot) {
+ return cur;
+ } else if (res == SnapshotCheckerResult::kSnapshotReleased) {
+ released_snapshots_.insert(cur);
+ }
+ *prev_snapshot = cur;
+ }
+ return kMaxSequenceNumber;
+}
+
+uint64_t CompactionIterator::ComputeBlobGarbageCollectionCutoffFileNumber(
+ const CompactionProxy* compaction) {
+ if (!compaction) {
+ return 0;
+ }
+
+ if (!compaction->enable_blob_garbage_collection()) {
+ return 0;
+ }
+
+ const Version* const version = compaction->input_version();
+ assert(version);
+
+ const VersionStorageInfo* const storage_info = version->storage_info();
+ assert(storage_info);
+
+ const auto& blob_files = storage_info->GetBlobFiles();
+
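+  // The age cutoff is a fraction in [0, 1]: the oldest cutoff-fraction of
+  // blob files is eligible for GC. A cutoff index at or past the end means
+  // all blob files are eligible, so return the largest possible file number.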
+ const size_t cutoff_index = static_cast<size_t>(
+ compaction->blob_garbage_collection_age_cutoff() * blob_files.size());
+
+ if (cutoff_index >= blob_files.size()) {
+ return std::numeric_limits<uint64_t>::max();
+ }
+
+ const auto& meta = blob_files[cutoff_index];
+ assert(meta);
+
+ return meta->GetBlobFileNumber();
+}
+
+std::unique_ptr<BlobFetcher> CompactionIterator::CreateBlobFetcherIfNeeded(
+ const CompactionProxy* compaction) {
+ if (!compaction) {
+ return nullptr;
+ }
+
+ const Version* const version = compaction->input_version();
+ if (!version) {
+ return nullptr;
+ }
+
+ ReadOptions read_options;
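+  // Do not populate caches with blobs read only for relocation during
+  // compaction.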
+ read_options.fill_cache = false;
+
+ return std::unique_ptr<BlobFetcher>(new BlobFetcher(version, read_options));
+}
+
+std::unique_ptr<PrefetchBufferCollection>
+CompactionIterator::CreatePrefetchBufferCollectionIfNeeded(
+ const CompactionProxy* compaction) {
+ if (!compaction) {
+ return nullptr;
+ }
+
+ if (!compaction->input_version()) {
+ return nullptr;
+ }
+
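+  // Prefetch buffers are not used when reads are memory-mapped.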
+ if (compaction->allow_mmap_reads()) {
+ return nullptr;
+ }
+
+ const uint64_t readahead_size = compaction->blob_compaction_readahead_size();
+ if (!readahead_size) {
+ return nullptr;
+ }
+
+ return std::unique_ptr<PrefetchBufferCollection>(
+ new PrefetchBufferCollection(readahead_size));
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_iterator.h b/src/rocksdb/db/compaction/compaction_iterator.h
new file mode 100644
index 000000000..c215d2bbb
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_iterator.h
@@ -0,0 +1,513 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <algorithm>
+#include <cinttypes>
+#include <deque>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "db/compaction/compaction.h"
+#include "db/compaction/compaction_iteration_stats.h"
+#include "db/merge_helper.h"
+#include "db/pinned_iterators_manager.h"
+#include "db/range_del_aggregator.h"
+#include "db/snapshot_checker.h"
+#include "options/cf_options.h"
+#include "rocksdb/compaction_filter.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class BlobFileBuilder;
+class BlobFetcher;
+class PrefetchBufferCollection;
+
+// A wrapper around an internal iterator whose purpose is to count how
+// many entries there are in the iterator.
+class SequenceIterWrapper : public InternalIterator {
+ public:
+ SequenceIterWrapper(InternalIterator* iter, const Comparator* cmp,
+ bool need_count_entries)
+ : icmp_(cmp),
+ inner_iter_(iter),
+ need_count_entries_(need_count_entries) {}
+ bool Valid() const override { return inner_iter_->Valid(); }
+ Status status() const override { return inner_iter_->status(); }
+ void Next() override {
+ num_itered_++;
+ inner_iter_->Next();
+ }
+ void Seek(const Slice& target) override {
+ if (!need_count_entries_) {
+ inner_iter_->Seek(target);
+ } else {
+      // For flush cases, we need to count the total number of entries, so we
+      // do Next() rather than Seek().
+ while (inner_iter_->Valid() &&
+ icmp_.Compare(inner_iter_->key(), target) < 0) {
+ Next();
+ }
+ }
+ }
+ Slice key() const override { return inner_iter_->key(); }
+ Slice value() const override { return inner_iter_->value(); }
+
+ // Unused InternalIterator methods
+ void SeekToFirst() override { assert(false); }
+ void Prev() override { assert(false); }
+ void SeekForPrev(const Slice& /* target */) override { assert(false); }
+ void SeekToLast() override { assert(false); }
+
+ uint64_t num_itered() const { return num_itered_; }
+
+ private:
+ InternalKeyComparator icmp_;
+ InternalIterator* inner_iter_; // not owned
+ uint64_t num_itered_ = 0;
+ bool need_count_entries_;
+};
+
+class CompactionIterator {
+ public:
+ // A wrapper around Compaction. Has a much smaller interface, only what
+ // CompactionIterator uses. Tests can override it.
+ class CompactionProxy {
+ public:
+ virtual ~CompactionProxy() = default;
+
+ virtual int level() const = 0;
+
+ virtual bool KeyNotExistsBeyondOutputLevel(
+ const Slice& user_key, std::vector<size_t>* level_ptrs) const = 0;
+
+ virtual bool bottommost_level() const = 0;
+
+ virtual int number_levels() const = 0;
+
+ // Result includes timestamp if user-defined timestamp is enabled.
+ virtual Slice GetLargestUserKey() const = 0;
+
+ virtual bool allow_ingest_behind() const = 0;
+
+ virtual bool allow_mmap_reads() const = 0;
+
+ virtual bool enable_blob_garbage_collection() const = 0;
+
+ virtual double blob_garbage_collection_age_cutoff() const = 0;
+
+ virtual uint64_t blob_compaction_readahead_size() const = 0;
+
+ virtual const Version* input_version() const = 0;
+
+ virtual bool DoesInputReferenceBlobFiles() const = 0;
+
+ virtual const Compaction* real_compaction() const = 0;
+
+ virtual bool SupportsPerKeyPlacement() const = 0;
+
+ // `key` includes timestamp if user-defined timestamp is enabled.
+ virtual bool WithinPenultimateLevelOutputRange(const Slice& key) const = 0;
+ };
+
+ class RealCompaction : public CompactionProxy {
+ public:
+ explicit RealCompaction(const Compaction* compaction)
+ : compaction_(compaction) {
+ assert(compaction_);
+ assert(compaction_->immutable_options());
+ assert(compaction_->mutable_cf_options());
+ }
+
+ int level() const override { return compaction_->level(); }
+
+ bool KeyNotExistsBeyondOutputLevel(
+ const Slice& user_key, std::vector<size_t>* level_ptrs) const override {
+ return compaction_->KeyNotExistsBeyondOutputLevel(user_key, level_ptrs);
+ }
+
+ bool bottommost_level() const override {
+ return compaction_->bottommost_level();
+ }
+
+ int number_levels() const override { return compaction_->number_levels(); }
+
+ // Result includes timestamp if user-defined timestamp is enabled.
+ Slice GetLargestUserKey() const override {
+ return compaction_->GetLargestUserKey();
+ }
+
+ bool allow_ingest_behind() const override {
+ return compaction_->immutable_options()->allow_ingest_behind;
+ }
+
+ bool allow_mmap_reads() const override {
+ return compaction_->immutable_options()->allow_mmap_reads;
+ }
+
+ bool enable_blob_garbage_collection() const override {
+ return compaction_->enable_blob_garbage_collection();
+ }
+
+ double blob_garbage_collection_age_cutoff() const override {
+ return compaction_->blob_garbage_collection_age_cutoff();
+ }
+
+ uint64_t blob_compaction_readahead_size() const override {
+ return compaction_->mutable_cf_options()->blob_compaction_readahead_size;
+ }
+
+ const Version* input_version() const override {
+ return compaction_->input_version();
+ }
+
+ bool DoesInputReferenceBlobFiles() const override {
+ return compaction_->DoesInputReferenceBlobFiles();
+ }
+
+ const Compaction* real_compaction() const override { return compaction_; }
+
+ bool SupportsPerKeyPlacement() const override {
+ return compaction_->SupportsPerKeyPlacement();
+ }
+
+ // Check if key is within penultimate level output range, to see if it's
+ // safe to output to the penultimate level for per_key_placement feature.
+ // `key` includes timestamp if user-defined timestamp is enabled.
+ bool WithinPenultimateLevelOutputRange(const Slice& key) const override {
+ return compaction_->WithinPenultimateLevelOutputRange(key);
+ }
+
+ private:
+ const Compaction* compaction_;
+ };
+
+ CompactionIterator(
+ InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
+ SequenceNumber last_sequence, std::vector<SequenceNumber>* snapshots,
+ SequenceNumber earliest_write_conflict_snapshot,
+ SequenceNumber job_snapshot, const SnapshotChecker* snapshot_checker,
+ Env* env, bool report_detailed_time, bool expect_valid_internal_key,
+ CompactionRangeDelAggregator* range_del_agg,
+ BlobFileBuilder* blob_file_builder, bool allow_data_in_errors,
+ bool enforce_single_del_contracts,
+ const std::atomic<bool>& manual_compaction_canceled,
+ const Compaction* compaction = nullptr,
+ const CompactionFilter* compaction_filter = nullptr,
+ const std::atomic<bool>* shutting_down = nullptr,
+ const std::shared_ptr<Logger> info_log = nullptr,
+ const std::string* full_history_ts_low = nullptr,
+ const SequenceNumber preserve_time_min_seqno = kMaxSequenceNumber,
+ const SequenceNumber preclude_last_level_min_seqno = kMaxSequenceNumber);
+
+ // Constructor with custom CompactionProxy, used for tests.
+ CompactionIterator(
+ InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
+ SequenceNumber last_sequence, std::vector<SequenceNumber>* snapshots,
+ SequenceNumber earliest_write_conflict_snapshot,
+ SequenceNumber job_snapshot, const SnapshotChecker* snapshot_checker,
+ Env* env, bool report_detailed_time, bool expect_valid_internal_key,
+ CompactionRangeDelAggregator* range_del_agg,
+ BlobFileBuilder* blob_file_builder, bool allow_data_in_errors,
+ bool enforce_single_del_contracts,
+ const std::atomic<bool>& manual_compaction_canceled,
+ std::unique_ptr<CompactionProxy> compaction,
+ const CompactionFilter* compaction_filter = nullptr,
+ const std::atomic<bool>* shutting_down = nullptr,
+ const std::shared_ptr<Logger> info_log = nullptr,
+ const std::string* full_history_ts_low = nullptr,
+ const SequenceNumber preserve_time_min_seqno = kMaxSequenceNumber,
+ const SequenceNumber preclude_last_level_min_seqno = kMaxSequenceNumber);
+
+ ~CompactionIterator();
+
+ void ResetRecordCounts();
+
+ // Seek to the beginning of the compaction iterator output.
+ //
+ // REQUIRED: Call only once.
+ void SeekToFirst();
+
+ // Produces the next record in the compaction.
+ //
+ // REQUIRED: SeekToFirst() has been called.
+ void Next();
+
+ // Getters
+ const Slice& key() const { return key_; }
+ const Slice& value() const { return value_; }
+ const Status& status() const { return status_; }
+ const ParsedInternalKey& ikey() const { return ikey_; }
+ inline bool Valid() const { return validity_info_.IsValid(); }
+ const Slice& user_key() const { return current_user_key_; }
+ const CompactionIterationStats& iter_stats() const { return iter_stats_; }
+ uint64_t num_input_entry_scanned() const { return input_.num_itered(); }
+  // Whether the current key should be placed on the penultimate level; only
+  // valid if per_key_placement is supported.
+ bool output_to_penultimate_level() const {
+ return output_to_penultimate_level_;
+ }
+ Status InputStatus() const { return input_.status(); }
+
+ private:
+ // Processes the input stream to find the next output
+ void NextFromInput();
+
+  // Do final preparations before presenting the output to the caller.
+ void PrepareOutput();
+
+  // Decide whether the current key should be output to the last level or the
+  // penultimate level; only called for compactions that support per-key
+  // placement.
+ void DecideOutputLevel();
+
+ // Passes the output value to the blob file builder (if any), and replaces it
+ // with the corresponding blob reference if it has been actually written to a
+ // blob file (i.e. if it passed the value size check). Returns true if the
+ // value got extracted to a blob file, false otherwise.
+ bool ExtractLargeValueIfNeededImpl();
+
+ // Extracts large values as described above, and updates the internal key's
+ // type to kTypeBlobIndex if the value got extracted. Should only be called
+ // for regular values (kTypeValue).
+ void ExtractLargeValueIfNeeded();
+
+ // Relocates valid blobs residing in the oldest blob files if garbage
+ // collection is enabled. Relocated blobs are written to new blob files or
+ // inlined in the LSM tree depending on the current settings (i.e.
+ // enable_blob_files and min_blob_size). Should only be called for blob
+ // references (kTypeBlobIndex).
+ //
+ // Note: the stacked BlobDB implementation's compaction filter based GC
+ // algorithm is also called from here.
+ void GarbageCollectBlobIfNeeded();
+
+ // Invoke compaction filter if needed.
+ // Return true on success, false on failures (e.g.: kIOError).
+ bool InvokeFilterIfNeeded(bool* need_skip, Slice* skip_until);
+
+ // Given a sequence number, return the sequence number of the
+ // earliest snapshot that this sequence number is visible in.
+ // The snapshots themselves are arranged in ascending order of
+ // sequence numbers.
+  // Employ a sequential search because the total number of
+  // snapshots is typically small.
+ inline SequenceNumber findEarliestVisibleSnapshot(
+ SequenceNumber in, SequenceNumber* prev_snapshot);
+
+ inline bool KeyCommitted(SequenceNumber sequence) {
+ return snapshot_checker_ == nullptr ||
+ snapshot_checker_->CheckInSnapshot(sequence, job_snapshot_) ==
+ SnapshotCheckerResult::kInSnapshot;
+ }
+
+ bool DefinitelyInSnapshot(SequenceNumber seq, SequenceNumber snapshot);
+
+ bool DefinitelyNotInSnapshot(SequenceNumber seq, SequenceNumber snapshot);
+
+ // Extract user-defined timestamp from user key if possible and compare it
+ // with *full_history_ts_low_ if applicable.
+ inline void UpdateTimestampAndCompareWithFullHistoryLow() {
+ if (!timestamp_size_) {
+ return;
+ }
+ Slice ts = ExtractTimestampFromUserKey(ikey_.user_key, timestamp_size_);
+ curr_ts_.assign(ts.data(), ts.size());
+ if (full_history_ts_low_) {
+ cmp_with_history_ts_low_ =
+ cmp_->CompareTimestamp(ts, *full_history_ts_low_);
+ }
+ }
+
+ static uint64_t ComputeBlobGarbageCollectionCutoffFileNumber(
+ const CompactionProxy* compaction);
+ static std::unique_ptr<BlobFetcher> CreateBlobFetcherIfNeeded(
+ const CompactionProxy* compaction);
+ static std::unique_ptr<PrefetchBufferCollection>
+ CreatePrefetchBufferCollectionIfNeeded(const CompactionProxy* compaction);
+
+ SequenceIterWrapper input_;
+ const Comparator* cmp_;
+ MergeHelper* merge_helper_;
+ const std::vector<SequenceNumber>* snapshots_;
+ // List of snapshots released during compaction.
+  // findEarliestVisibleSnapshot() discovers them from the results of
+  // snapshot_checker, and makes sure they will not be returned as the
+  // earliest visible snapshot of an older value.
+ // See WritePreparedTransactionTest::ReleaseSnapshotDuringCompaction3.
+ std::unordered_set<SequenceNumber> released_snapshots_;
+ const SequenceNumber earliest_write_conflict_snapshot_;
+ const SequenceNumber job_snapshot_;
+ const SnapshotChecker* const snapshot_checker_;
+ Env* env_;
+ SystemClock* clock_;
+ const bool report_detailed_time_;
+ const bool expect_valid_internal_key_;
+ CompactionRangeDelAggregator* range_del_agg_;
+ BlobFileBuilder* blob_file_builder_;
+ std::unique_ptr<CompactionProxy> compaction_;
+ const CompactionFilter* compaction_filter_;
+ const std::atomic<bool>* shutting_down_;
+ const std::atomic<bool>& manual_compaction_canceled_;
+ const bool bottommost_level_;
+ const bool visible_at_tip_;
+ const SequenceNumber earliest_snapshot_;
+
+ std::shared_ptr<Logger> info_log_;
+
+ const bool allow_data_in_errors_;
+
+ const bool enforce_single_del_contracts_;
+
+ // Comes from comparator.
+ const size_t timestamp_size_;
+
+ // Lower bound timestamp to retain full history in terms of user-defined
+ // timestamp. If a key's timestamp is older than full_history_ts_low_, then
+ // the key *may* be eligible for garbage collection (GC). The skipping logic
+ // is in `NextFromInput()` and `PrepareOutput()`.
+ // If nullptr, NO GC will be performed and all history will be preserved.
+ const std::string* const full_history_ts_low_;
+
+ // State
+ //
+ enum ValidContext : uint8_t {
+ kMerge1 = 0,
+ kMerge2 = 1,
+ kParseKeyError = 2,
+ kCurrentKeyUncommitted = 3,
+ kKeepSDAndClearPut = 4,
+ kKeepTsHistory = 5,
+ kKeepSDForConflictCheck = 6,
+ kKeepSDForSnapshot = 7,
+ kKeepSD = 8,
+ kKeepDel = 9,
+ kNewUserKey = 10,
+ };
+
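+  // Packs the validity flag and the ValidContext that produced it into a
+  // single byte: the lowest bit indicates validity, the remaining bits store
+  // the context (useful when debugging why an entry was emitted).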
+ struct ValidityInfo {
+ inline bool IsValid() const { return rep & 1; }
+ ValidContext GetContext() const {
+ return static_cast<ValidContext>(rep >> 1);
+ }
+ inline void SetValid(uint8_t ctx) { rep = (ctx << 1) | 1; }
+ inline void Invalidate() { rep = 0; }
+
+ uint8_t rep{0};
+ } validity_info_;
+
+ // Points to a copy of the current compaction iterator output (current_key_)
+ // if valid.
+ Slice key_;
+ // Points to the value in the underlying iterator that corresponds to the
+ // current output.
+ Slice value_;
+ // The status is OK unless compaction iterator encounters a merge operand
+ // while not having a merge operator defined.
+ Status status_;
+ // Stores the user key, sequence number and type of the current compaction
+ // iterator output (or current key in the underlying iterator during
+ // NextFromInput()).
+ ParsedInternalKey ikey_;
+ // Stores whether ikey_.user_key is valid. If set to false, the user key is
+ // not compared against the current key in the underlying iterator.
+ bool has_current_user_key_ = false;
+ // If false, the iterator holds a copy of the current compaction iterator
+ // output (or current key in the underlying iterator during NextFromInput()).
+ bool at_next_ = false;
+
+ IterKey current_key_;
+ Slice current_user_key_;
+ std::string curr_ts_;
+ SequenceNumber current_user_key_sequence_;
+ SequenceNumber current_user_key_snapshot_;
+
+ // True if the iterator has already returned a record for the current key.
+ bool has_outputted_key_ = false;
+
+  // If true, clear the value of the next key and output it without applying
+  // any compaction rules. This is used for outputting a put after a single
+  // delete.
+ bool clear_and_output_next_key_ = false;
+
+ MergeOutputIterator merge_out_iter_;
+ // PinnedIteratorsManager used to pin input_ Iterator blocks while reading
+ // merge operands and then releasing them after consuming them.
+ PinnedIteratorsManager pinned_iters_mgr_;
+
+ uint64_t blob_garbage_collection_cutoff_file_number_;
+
+ std::unique_ptr<BlobFetcher> blob_fetcher_;
+ std::unique_ptr<PrefetchBufferCollection> prefetch_buffers_;
+
+ std::string blob_index_;
+ PinnableSlice blob_value_;
+ std::string compaction_filter_value_;
+ InternalKey compaction_filter_skip_until_;
+ // "level_ptrs" holds indices that remember which file of an associated
+ // level we were last checking during the last call to compaction->
+ // KeyNotExistsBeyondOutputLevel(). This allows future calls to the function
+ // to pick off where it left off since each subcompaction's key range is
+ // increasing so a later call to the function must be looking for a key that
+ // is in or beyond the last file checked during the previous call
+ std::vector<size_t> level_ptrs_;
+ CompactionIterationStats iter_stats_;
+
+ // Used to avoid purging uncommitted values. The application can specify
+ // uncommitted values by providing a SnapshotChecker object.
+ bool current_key_committed_;
+
+ // Saved result of ucmp->CompareTimestamp(current_ts_, *full_history_ts_low_)
+ int cmp_with_history_ts_low_;
+
+ const int level_;
+
+ // True if the previous internal key (same user key)'s sequence number has
+ // just been zeroed out during bottommost compaction.
+ bool last_key_seq_zeroed_{false};
+
+  // True if the current key should be output to the penultimate level if
+  // possible; the compaction logic makes the final decision on which level to
+  // output to.
+ bool output_to_penultimate_level_{false};
+
+ // min seqno for preserving the time information.
+ const SequenceNumber preserve_time_min_seqno_ = kMaxSequenceNumber;
+
+  // Min seqno to preclude data from the last level; if a key's seqno is larger
+  // than this, it will be output to the penultimate level.
+ const SequenceNumber preclude_last_level_min_seqno_ = kMaxSequenceNumber;
+
+ void AdvanceInputIter() { input_.Next(); }
+
+ void SkipUntil(const Slice& skip_until) { input_.Seek(skip_until); }
+
+ bool IsShuttingDown() {
+ // This is a best-effort facility, so memory_order_relaxed is sufficient.
+ return shutting_down_ && shutting_down_->load(std::memory_order_relaxed);
+ }
+
+ bool IsPausingManualCompaction() {
+ // This is a best-effort facility, so memory_order_relaxed is sufficient.
+ return manual_compaction_canceled_.load(std::memory_order_relaxed);
+ }
+};
+
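+// Note: with a SnapshotChecker, a sequence number can be neither "definitely
+// in" nor "definitely not in" a snapshot (e.g. when the checker reports the
+// snapshot as released), so these two predicates are not simply complements.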
+inline bool CompactionIterator::DefinitelyInSnapshot(SequenceNumber seq,
+ SequenceNumber snapshot) {
+ return ((seq) <= (snapshot) &&
+ (snapshot_checker_ == nullptr ||
+ LIKELY(snapshot_checker_->CheckInSnapshot((seq), (snapshot)) ==
+ SnapshotCheckerResult::kInSnapshot)));
+}
+
+inline bool CompactionIterator::DefinitelyNotInSnapshot(
+ SequenceNumber seq, SequenceNumber snapshot) {
+ return ((seq) > (snapshot) ||
+ (snapshot_checker_ != nullptr &&
+ UNLIKELY(snapshot_checker_->CheckInSnapshot((seq), (snapshot)) ==
+ SnapshotCheckerResult::kNotInSnapshot)));
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_iterator_test.cc b/src/rocksdb/db/compaction/compaction_iterator_test.cc
new file mode 100644
index 000000000..81362d792
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_iterator_test.cc
@@ -0,0 +1,1618 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/compaction/compaction_iterator.h"
+
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "port/port.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+#include "util/vector_iterator.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Expects no merging attempts.
+class NoMergingMergeOp : public MergeOperator {
+ public:
+ bool FullMergeV2(const MergeOperationInput& /*merge_in*/,
+ MergeOperationOutput* /*merge_out*/) const override {
+ ADD_FAILURE();
+ return false;
+ }
+ bool PartialMergeMulti(const Slice& /*key*/,
+ const std::deque<Slice>& /*operand_list*/,
+ std::string* /*new_value*/,
+ Logger* /*logger*/) const override {
+ ADD_FAILURE();
+ return false;
+ }
+ const char* Name() const override {
+ return "CompactionIteratorTest NoMergingMergeOp";
+ }
+};
+
+// Compaction filter that gets stuck when it sees a particular key,
+// then gets unstuck when told to.
+// Always returns Decision::kRemove.
+class StallingFilter : public CompactionFilter {
+ public:
+ Decision FilterV2(int /*level*/, const Slice& key, ValueType /*type*/,
+ const Slice& /*existing_value*/, std::string* /*new_value*/,
+ std::string* /*skip_until*/) const override {
+ int k = std::atoi(key.ToString().c_str());
+ last_seen.store(k);
+ while (k >= stall_at.load()) {
+ std::this_thread::yield();
+ }
+ return Decision::kRemove;
+ }
+
+ const char* Name() const override {
+ return "CompactionIteratorTest StallingFilter";
+ }
+
+ // Wait until the filter sees a key >= k and stalls at that key.
+ // If `exact`, asserts that the seen key is equal to k.
+ void WaitForStall(int k, bool exact = true) {
+ stall_at.store(k);
+ while (last_seen.load() < k) {
+ std::this_thread::yield();
+ }
+ if (exact) {
+ EXPECT_EQ(k, last_seen.load());
+ }
+ }
+
+ // Filter will stall on key >= stall_at. Advance stall_at to unstall.
+ mutable std::atomic<int> stall_at{0};
+ // Last key the filter was called with.
+ mutable std::atomic<int> last_seen{0};
+};
+
+// Compaction filter that filters out all keys.
+class FilterAllKeysCompactionFilter : public CompactionFilter {
+ public:
+ Decision FilterV2(int /*level*/, const Slice& /*key*/, ValueType /*type*/,
+ const Slice& /*existing_value*/, std::string* /*new_value*/,
+ std::string* /*skip_until*/) const override {
+ return Decision::kRemove;
+ }
+
+ const char* Name() const override { return "AllKeysCompactionFilter"; }
+};
+
+class LoggingForwardVectorIterator : public VectorIterator {
+ public:
+ struct Action {
+ enum class Type {
+ SEEK_TO_FIRST,
+ SEEK,
+ NEXT,
+ };
+
+ Type type;
+ std::string arg;
+
+ explicit Action(Type _type, std::string _arg = "")
+ : type(_type), arg(_arg) {}
+
+ bool operator==(const Action& rhs) const {
+ return std::tie(type, arg) == std::tie(rhs.type, rhs.arg);
+ }
+ };
+
+ LoggingForwardVectorIterator(const std::vector<std::string>& keys,
+ const std::vector<std::string>& values)
+ : VectorIterator(keys, values) {
+ current_ = keys_.size();
+ }
+
+ void SeekToFirst() override {
+ log.emplace_back(Action::Type::SEEK_TO_FIRST);
+ VectorIterator::SeekToFirst();
+ }
+ void SeekToLast() override { assert(false); }
+
+ void Seek(const Slice& target) override {
+ log.emplace_back(Action::Type::SEEK, target.ToString());
+ VectorIterator::Seek(target);
+ }
+
+ void SeekForPrev(const Slice& /*target*/) override { assert(false); }
+
+ void Next() override {
+ assert(Valid());
+ log.emplace_back(Action::Type::NEXT);
+ VectorIterator::Next();
+ }
+ void Prev() override { assert(false); }
+
+ Slice key() const override {
+ assert(Valid());
+ return VectorIterator::key();
+ }
+ Slice value() const override {
+ assert(Valid());
+ return VectorIterator::value();
+ }
+
+ std::vector<Action> log;
+};
+
+class FakeCompaction : public CompactionIterator::CompactionProxy {
+ public:
+ int level() const override { return 0; }
+
+ bool KeyNotExistsBeyondOutputLevel(
+ const Slice& /*user_key*/,
+ std::vector<size_t>* /*level_ptrs*/) const override {
+ return is_bottommost_level || key_not_exists_beyond_output_level;
+ }
+
+ bool bottommost_level() const override { return is_bottommost_level; }
+
+ int number_levels() const override { return 1; }
+
+ Slice GetLargestUserKey() const override {
+ return "\xff\xff\xff\xff\xff\xff\xff\xff\xff";
+ }
+
+ bool allow_ingest_behind() const override { return is_allow_ingest_behind; }
+
+ bool allow_mmap_reads() const override { return false; }
+
+ bool enable_blob_garbage_collection() const override { return false; }
+
+ double blob_garbage_collection_age_cutoff() const override { return 0.0; }
+
+ uint64_t blob_compaction_readahead_size() const override { return 0; }
+
+ const Version* input_version() const override { return nullptr; }
+
+ bool DoesInputReferenceBlobFiles() const override { return false; }
+
+ const Compaction* real_compaction() const override { return nullptr; }
+
+ bool SupportsPerKeyPlacement() const override {
+ return supports_per_key_placement;
+ }
+
+ bool WithinPenultimateLevelOutputRange(const Slice& key) const override {
+ return (!key.starts_with("unsafe_pb"));
+ }
+
+ bool key_not_exists_beyond_output_level = false;
+
+ bool is_bottommost_level = false;
+
+ bool is_allow_ingest_behind = false;
+
+ bool supports_per_key_placement = false;
+};
+
+// A simplified snapshot checker which assumes each snapshot has a global
+// last visible sequence.
+class TestSnapshotChecker : public SnapshotChecker {
+ public:
+ explicit TestSnapshotChecker(
+ SequenceNumber last_committed_sequence,
+ const std::unordered_map<SequenceNumber, SequenceNumber>& snapshots =
+ {{}})
+ : last_committed_sequence_(last_committed_sequence),
+ snapshots_(snapshots) {}
+
+ SnapshotCheckerResult CheckInSnapshot(
+ SequenceNumber seq, SequenceNumber snapshot_seq) const override {
+ if (snapshot_seq == kMaxSequenceNumber) {
+ return seq <= last_committed_sequence_
+ ? SnapshotCheckerResult::kInSnapshot
+ : SnapshotCheckerResult::kNotInSnapshot;
+ }
+ assert(snapshots_.count(snapshot_seq) > 0);
+ return seq <= snapshots_.at(snapshot_seq)
+ ? SnapshotCheckerResult::kInSnapshot
+ : SnapshotCheckerResult::kNotInSnapshot;
+ }
+
+ private:
+ SequenceNumber last_committed_sequence_;
+  // A map from each valid snapshot to the last sequence visible to it.
+ std::unordered_map<SequenceNumber, SequenceNumber> snapshots_;
+};
+
+// Test param:
+// bool: whether to pass snapshot_checker to compaction iterator.
+class CompactionIteratorTest : public testing::TestWithParam<bool> {
+ public:
+ CompactionIteratorTest()
+ : cmp_(BytewiseComparator()), icmp_(cmp_), snapshots_({}) {}
+
+ explicit CompactionIteratorTest(const Comparator* ucmp)
+ : cmp_(ucmp), icmp_(cmp_), snapshots_({}) {}
+
+ void InitIterators(
+ const std::vector<std::string>& ks, const std::vector<std::string>& vs,
+ const std::vector<std::string>& range_del_ks,
+ const std::vector<std::string>& range_del_vs,
+ SequenceNumber last_sequence,
+ SequenceNumber last_committed_sequence = kMaxSequenceNumber,
+ MergeOperator* merge_op = nullptr, CompactionFilter* filter = nullptr,
+ bool bottommost_level = false,
+ SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber,
+ bool key_not_exists_beyond_output_level = false,
+ const std::string* full_history_ts_low = nullptr) {
+ std::unique_ptr<InternalIterator> unfragmented_range_del_iter(
+ new VectorIterator(range_del_ks, range_del_vs, &icmp_));
+ auto tombstone_list = std::make_shared<FragmentedRangeTombstoneList>(
+ std::move(unfragmented_range_del_iter), icmp_);
+ std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+ new FragmentedRangeTombstoneIterator(tombstone_list, icmp_,
+ kMaxSequenceNumber));
+ range_del_agg_.reset(new CompactionRangeDelAggregator(&icmp_, snapshots_));
+ range_del_agg_->AddTombstones(std::move(range_del_iter));
+
+ std::unique_ptr<CompactionIterator::CompactionProxy> compaction;
+ if (filter || bottommost_level || key_not_exists_beyond_output_level) {
+ compaction_proxy_ = new FakeCompaction();
+ compaction_proxy_->is_bottommost_level = bottommost_level;
+ compaction_proxy_->is_allow_ingest_behind = AllowIngestBehind();
+ compaction_proxy_->key_not_exists_beyond_output_level =
+ key_not_exists_beyond_output_level;
+ compaction_proxy_->supports_per_key_placement = SupportsPerKeyPlacement();
+ compaction.reset(compaction_proxy_);
+ }
+ bool use_snapshot_checker = UseSnapshotChecker() || GetParam();
+ if (use_snapshot_checker || last_committed_sequence < kMaxSequenceNumber) {
+ snapshot_checker_.reset(
+ new TestSnapshotChecker(last_committed_sequence, snapshot_map_));
+ }
+ merge_helper_.reset(
+ new MergeHelper(Env::Default(), cmp_, merge_op, filter, nullptr, false,
+ 0 /*latest_snapshot*/, snapshot_checker_.get(),
+ 0 /*level*/, nullptr /*statistics*/, &shutting_down_));
+
+ if (c_iter_) {
+ // Since iter_ is still used in ~CompactionIterator(), we call
+ // ~CompactionIterator() first.
+ c_iter_.reset();
+ }
+ iter_.reset(new LoggingForwardVectorIterator(ks, vs));
+ iter_->SeekToFirst();
+ c_iter_.reset(new CompactionIterator(
+ iter_.get(), cmp_, merge_helper_.get(), last_sequence, &snapshots_,
+ earliest_write_conflict_snapshot, kMaxSequenceNumber,
+ snapshot_checker_.get(), Env::Default(),
+ false /* report_detailed_time */, false, range_del_agg_.get(),
+ nullptr /* blob_file_builder */, true /*allow_data_in_errors*/,
+ true /*enforce_single_del_contracts*/,
+ /*manual_compaction_canceled=*/kManualCompactionCanceledFalse_,
+ std::move(compaction), filter, &shutting_down_, /*info_log=*/nullptr,
+ full_history_ts_low));
+ }
+
+ void AddSnapshot(SequenceNumber snapshot,
+ SequenceNumber last_visible_seq = kMaxSequenceNumber) {
+ snapshots_.push_back(snapshot);
+ snapshot_map_[snapshot] = last_visible_seq;
+ }
+
+ virtual bool UseSnapshotChecker() const { return false; }
+
+ virtual bool AllowIngestBehind() const { return false; }
+
+ virtual bool SupportsPerKeyPlacement() const { return false; }
+
+ void RunTest(
+ const std::vector<std::string>& input_keys,
+ const std::vector<std::string>& input_values,
+ const std::vector<std::string>& expected_keys,
+ const std::vector<std::string>& expected_values,
+ SequenceNumber last_committed_seq = kMaxSequenceNumber,
+ MergeOperator* merge_operator = nullptr,
+ CompactionFilter* compaction_filter = nullptr,
+ bool bottommost_level = false,
+ SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber,
+ bool key_not_exists_beyond_output_level = false,
+ const std::string* full_history_ts_low = nullptr) {
+ InitIterators(input_keys, input_values, {}, {}, kMaxSequenceNumber,
+ last_committed_seq, merge_operator, compaction_filter,
+ bottommost_level, earliest_write_conflict_snapshot,
+ key_not_exists_beyond_output_level, full_history_ts_low);
+ c_iter_->SeekToFirst();
+ for (size_t i = 0; i < expected_keys.size(); i++) {
+ std::string info = "i = " + std::to_string(i);
+ ASSERT_TRUE(c_iter_->Valid()) << info;
+ ASSERT_OK(c_iter_->status()) << info;
+ ASSERT_EQ(expected_keys[i], c_iter_->key().ToString()) << info;
+ ASSERT_EQ(expected_values[i], c_iter_->value().ToString()) << info;
+ c_iter_->Next();
+ }
+ ASSERT_OK(c_iter_->status());
+ ASSERT_FALSE(c_iter_->Valid());
+ }
+
+ void ClearSnapshots() {
+ snapshots_.clear();
+ snapshot_map_.clear();
+ }
+
+ const Comparator* cmp_;
+ const InternalKeyComparator icmp_;
+ std::vector<SequenceNumber> snapshots_;
+  // A map from each valid snapshot to the last sequence visible to it.
+ std::unordered_map<SequenceNumber, SequenceNumber> snapshot_map_;
+ std::unique_ptr<MergeHelper> merge_helper_;
+ std::unique_ptr<LoggingForwardVectorIterator> iter_;
+ std::unique_ptr<CompactionIterator> c_iter_;
+ std::unique_ptr<CompactionRangeDelAggregator> range_del_agg_;
+ std::unique_ptr<SnapshotChecker> snapshot_checker_;
+ std::atomic<bool> shutting_down_{false};
+ const std::atomic<bool> kManualCompactionCanceledFalse_{false};
+  FakeCompaction* compaction_proxy_ = nullptr;
+};
+
+// It is possible that the output of the compaction iterator is empty even if
+// the input is not.
+TEST_P(CompactionIteratorTest, EmptyResult) {
+ InitIterators({test::KeyStr("a", 5, kTypeSingleDeletion),
+ test::KeyStr("a", 3, kTypeValue)},
+ {"", "val"}, {}, {}, 5);
+ c_iter_->SeekToFirst();
+ ASSERT_OK(c_iter_->status());
+ ASSERT_FALSE(c_iter_->Valid());
+}
+
+// If there is a corruption after a single deletion, the corrupted key should
+// be preserved.
+TEST_P(CompactionIteratorTest, CorruptionAfterSingleDeletion) {
+ InitIterators({test::KeyStr("a", 5, kTypeSingleDeletion),
+ test::KeyStr("a", 3, kTypeValue, true),
+ test::KeyStr("b", 10, kTypeValue)},
+ {"", "val", "val2"}, {}, {}, 10);
+ c_iter_->SeekToFirst();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("a", 5, kTypeSingleDeletion),
+ c_iter_->key().ToString());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("a", 3, kTypeValue, true), c_iter_->key().ToString());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("b", 10, kTypeValue), c_iter_->key().ToString());
+ c_iter_->Next();
+ ASSERT_OK(c_iter_->status());
+ ASSERT_FALSE(c_iter_->Valid());
+}
+
+TEST_P(CompactionIteratorTest, SimpleRangeDeletion) {
+ InitIterators({test::KeyStr("morning", 5, kTypeValue),
+ test::KeyStr("morning", 2, kTypeValue),
+ test::KeyStr("night", 3, kTypeValue)},
+ {"zao", "zao", "wan"},
+ {test::KeyStr("ma", 4, kTypeRangeDeletion)}, {"mz"}, 5);
+ c_iter_->SeekToFirst();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("morning", 5, kTypeValue), c_iter_->key().ToString());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("night", 3, kTypeValue), c_iter_->key().ToString());
+ c_iter_->Next();
+ ASSERT_OK(c_iter_->status());
+ ASSERT_FALSE(c_iter_->Valid());
+}
+
+TEST_P(CompactionIteratorTest, RangeDeletionWithSnapshots) {
+ AddSnapshot(10);
+ std::vector<std::string> ks1;
+ ks1.push_back(test::KeyStr("ma", 28, kTypeRangeDeletion));
+ std::vector<std::string> vs1{"mz"};
+ std::vector<std::string> ks2{test::KeyStr("morning", 15, kTypeValue),
+ test::KeyStr("morning", 5, kTypeValue),
+ test::KeyStr("night", 40, kTypeValue),
+ test::KeyStr("night", 20, kTypeValue)};
+ std::vector<std::string> vs2{"zao 15", "zao 5", "wan 40", "wan 20"};
+ InitIterators(ks2, vs2, ks1, vs1, 40);
+ c_iter_->SeekToFirst();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("morning", 5, kTypeValue), c_iter_->key().ToString());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("night", 40, kTypeValue), c_iter_->key().ToString());
+ c_iter_->Next();
+ ASSERT_OK(c_iter_->status());
+ ASSERT_FALSE(c_iter_->Valid());
+}
+
+TEST_P(CompactionIteratorTest, CompactionFilterSkipUntil) {
+ class Filter : public CompactionFilter {
+ Decision FilterV2(int /*level*/, const Slice& key, ValueType t,
+ const Slice& existing_value, std::string* /*new_value*/,
+ std::string* skip_until) const override {
+ std::string k = key.ToString();
+ std::string v = existing_value.ToString();
+ // See InitIterators() call below for the sequence of keys and their
+ // filtering decisions. Here we closely assert that compaction filter is
+ // called with the expected keys and only them, and with the right values.
+ if (k == "a") {
+ EXPECT_EQ(ValueType::kValue, t);
+ EXPECT_EQ("av50", v);
+ return Decision::kKeep;
+ }
+ if (k == "b") {
+ EXPECT_EQ(ValueType::kValue, t);
+ EXPECT_EQ("bv60", v);
+ *skip_until = "d+";
+ return Decision::kRemoveAndSkipUntil;
+ }
+ if (k == "e") {
+ EXPECT_EQ(ValueType::kMergeOperand, t);
+ EXPECT_EQ("em71", v);
+ return Decision::kKeep;
+ }
+ if (k == "f") {
+ if (v == "fm65") {
+ EXPECT_EQ(ValueType::kMergeOperand, t);
+ *skip_until = "f";
+ } else {
+ EXPECT_EQ("fm30", v);
+ EXPECT_EQ(ValueType::kMergeOperand, t);
+ *skip_until = "g+";
+ }
+ return Decision::kRemoveAndSkipUntil;
+ }
+ if (k == "h") {
+ EXPECT_EQ(ValueType::kValue, t);
+ EXPECT_EQ("hv91", v);
+ return Decision::kKeep;
+ }
+ if (k == "i") {
+ EXPECT_EQ(ValueType::kMergeOperand, t);
+ EXPECT_EQ("im95", v);
+ *skip_until = "z";
+ return Decision::kRemoveAndSkipUntil;
+ }
+ ADD_FAILURE();
+ return Decision::kKeep;
+ }
+
+ const char* Name() const override {
+ return "CompactionIteratorTest.CompactionFilterSkipUntil::Filter";
+ }
+ };
+
+ NoMergingMergeOp merge_op;
+ Filter filter;
+ InitIterators(
+ {test::KeyStr("a", 50, kTypeValue), // keep
+ test::KeyStr("a", 45, kTypeMerge),
+ test::KeyStr("b", 60, kTypeValue), // skip to "d+"
+ test::KeyStr("b", 40, kTypeValue), test::KeyStr("c", 35, kTypeValue),
+ test::KeyStr("d", 70, kTypeMerge),
+ test::KeyStr("e", 71, kTypeMerge), // keep
+ test::KeyStr("f", 65, kTypeMerge), // skip to "f", aka keep
+ test::KeyStr("f", 30, kTypeMerge), // skip to "g+"
+ test::KeyStr("f", 25, kTypeValue), test::KeyStr("g", 90, kTypeValue),
+ test::KeyStr("h", 91, kTypeValue), // keep
+ test::KeyStr("i", 95, kTypeMerge), // skip to "z"
+ test::KeyStr("j", 99, kTypeValue)},
+ {"av50", "am45", "bv60", "bv40", "cv35", "dm70", "em71", "fm65", "fm30",
+ "fv25", "gv90", "hv91", "im95", "jv99"},
+ {}, {}, kMaxSequenceNumber, kMaxSequenceNumber, &merge_op, &filter);
+
+ // Compaction should output just "a", "e" and "h" keys.
+ c_iter_->SeekToFirst();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("a", 50, kTypeValue), c_iter_->key().ToString());
+ ASSERT_EQ("av50", c_iter_->value().ToString());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("e", 71, kTypeMerge), c_iter_->key().ToString());
+ ASSERT_EQ("em71", c_iter_->value().ToString());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("h", 91, kTypeValue), c_iter_->key().ToString());
+ ASSERT_EQ("hv91", c_iter_->value().ToString());
+ c_iter_->Next();
+ ASSERT_OK(c_iter_->status());
+ ASSERT_FALSE(c_iter_->Valid());
+
+ // Check that the compaction iterator did the correct sequence of calls on
+ // the underlying iterator.
+ using A = LoggingForwardVectorIterator::Action;
+ using T = A::Type;
+ std::vector<A> expected_actions = {
+ A(T::SEEK_TO_FIRST),
+ A(T::NEXT),
+ A(T::NEXT),
+ A(T::SEEK, test::KeyStr("d+", kMaxSequenceNumber, kValueTypeForSeek)),
+ A(T::NEXT),
+ A(T::NEXT),
+ A(T::SEEK, test::KeyStr("g+", kMaxSequenceNumber, kValueTypeForSeek)),
+ A(T::NEXT),
+ A(T::SEEK, test::KeyStr("z", kMaxSequenceNumber, kValueTypeForSeek))};
+ ASSERT_EQ(expected_actions, iter_->log);
+}
+
+TEST_P(CompactionIteratorTest, ShuttingDownInFilter) {
+ NoMergingMergeOp merge_op;
+ StallingFilter filter;
+ InitIterators(
+ {test::KeyStr("1", 1, kTypeValue), test::KeyStr("2", 2, kTypeValue),
+ test::KeyStr("3", 3, kTypeValue), test::KeyStr("4", 4, kTypeValue)},
+ {"v1", "v2", "v3", "v4"}, {}, {}, kMaxSequenceNumber, kMaxSequenceNumber,
+ &merge_op, &filter);
+ // Don't leave tombstones (kTypeDeletion) for filtered keys.
+ compaction_proxy_->key_not_exists_beyond_output_level = true;
+
+ std::atomic<bool> seek_done{false};
+ ROCKSDB_NAMESPACE::port::Thread compaction_thread([&] {
+ c_iter_->SeekToFirst();
+ EXPECT_FALSE(c_iter_->Valid());
+ EXPECT_TRUE(c_iter_->status().IsShutdownInProgress());
+ seek_done.store(true);
+ });
+
+ // Let key 1 through.
+ filter.WaitForStall(1);
+
+ // Shutdown during compaction filter call for key 2.
+ filter.WaitForStall(2);
+ shutting_down_.store(true);
+ EXPECT_FALSE(seek_done.load());
+
+ // Unstall filter and wait for SeekToFirst() to return.
+ filter.stall_at.store(3);
+ compaction_thread.join();
+ assert(seek_done.load());
+
+ // Check that filter was never called again.
+ EXPECT_EQ(2, filter.last_seen.load());
+}
+
+// Same as ShuttingDownInFilter, but shutdown happens during filter call for
+// a merge operand, not for a value.
+TEST_P(CompactionIteratorTest, ShuttingDownInMerge) {
+ NoMergingMergeOp merge_op;
+ StallingFilter filter;
+ InitIterators(
+ {test::KeyStr("1", 1, kTypeValue), test::KeyStr("2", 2, kTypeMerge),
+ test::KeyStr("3", 3, kTypeMerge), test::KeyStr("4", 4, kTypeValue)},
+ {"v1", "v2", "v3", "v4"}, {}, {}, kMaxSequenceNumber, kMaxSequenceNumber,
+ &merge_op, &filter);
+ compaction_proxy_->key_not_exists_beyond_output_level = true;
+
+ std::atomic<bool> seek_done{false};
+ ROCKSDB_NAMESPACE::port::Thread compaction_thread([&] {
+ c_iter_->SeekToFirst();
+ ASSERT_FALSE(c_iter_->Valid());
+ ASSERT_TRUE(c_iter_->status().IsShutdownInProgress());
+ seek_done.store(true);
+ });
+
+ // Let key 1 through.
+ filter.WaitForStall(1);
+
+ // Shutdown during compaction filter call for key 2.
+ filter.WaitForStall(2);
+ shutting_down_.store(true);
+ EXPECT_FALSE(seek_done.load());
+
+ // Unstall filter and wait for SeekToFirst() to return.
+ filter.stall_at.store(3);
+ compaction_thread.join();
+ assert(seek_done.load());
+
+ // Check that filter was never called again.
+ EXPECT_EQ(2, filter.last_seen.load());
+}
+
+TEST_P(CompactionIteratorTest, SingleMergeOperand) {
+ class Filter : public CompactionFilter {
+ Decision FilterV2(int /*level*/, const Slice& key, ValueType t,
+ const Slice& existing_value, std::string* /*new_value*/,
+ std::string* /*skip_until*/) const override {
+ std::string k = key.ToString();
+ std::string v = existing_value.ToString();
+
+ // See InitIterators() call below for the sequence of keys and their
+ // filtering decisions. Here we closely assert that compaction filter is
+ // called with the expected keys and only them, and with the right values.
+ if (k == "a") {
+ EXPECT_EQ(ValueType::kMergeOperand, t);
+ EXPECT_EQ("av1", v);
+ return Decision::kKeep;
+ } else if (k == "b") {
+ EXPECT_EQ(ValueType::kMergeOperand, t);
+ return Decision::kKeep;
+ } else if (k == "c") {
+ return Decision::kKeep;
+ }
+
+ ADD_FAILURE();
+ return Decision::kKeep;
+ }
+
+ const char* Name() const override {
+ return "CompactionIteratorTest.SingleMergeOperand::Filter";
+ }
+ };
+
+ class SingleMergeOp : public MergeOperator {
+ public:
+ bool FullMergeV2(const MergeOperationInput& merge_in,
+ MergeOperationOutput* merge_out) const override {
+ // See InitIterators() call below for why "c" is the only key for which
+ // FullMergeV2 should be called.
+ EXPECT_EQ("c", merge_in.key.ToString());
+
+ std::string temp_value;
+ if (merge_in.existing_value != nullptr) {
+ temp_value = merge_in.existing_value->ToString();
+ }
+
+ for (auto& operand : merge_in.operand_list) {
+ temp_value.append(operand.ToString());
+ }
+ merge_out->new_value = temp_value;
+
+ return true;
+ }
+
+ bool PartialMergeMulti(const Slice& key,
+ const std::deque<Slice>& operand_list,
+ std::string* new_value,
+ Logger* /*logger*/) const override {
+ std::string string_key = key.ToString();
+ EXPECT_TRUE(string_key == "a" || string_key == "b");
+
+ if (string_key == "a") {
+ EXPECT_EQ(1, operand_list.size());
+ } else if (string_key == "b") {
+ EXPECT_EQ(2, operand_list.size());
+ }
+
+ std::string temp_value;
+ for (auto& operand : operand_list) {
+ temp_value.append(operand.ToString());
+ }
+ swap(temp_value, *new_value);
+
+ return true;
+ }
+
+ const char* Name() const override {
+ return "CompactionIteratorTest SingleMergeOp";
+ }
+
+ bool AllowSingleOperand() const override { return true; }
+ };
+
+ SingleMergeOp merge_op;
+ Filter filter;
+ InitIterators(
+ // a should invoke PartialMergeMulti with a single merge operand.
+ {test::KeyStr("a", 50, kTypeMerge),
+ // b should invoke PartialMergeMulti with two operands.
+ test::KeyStr("b", 70, kTypeMerge), test::KeyStr("b", 60, kTypeMerge),
+ // c should invoke FullMerge due to kTypeValue at the beginning.
+ test::KeyStr("c", 90, kTypeMerge), test::KeyStr("c", 80, kTypeValue)},
+ {"av1", "bv2", "bv1", "cv2", "cv1"}, {}, {}, kMaxSequenceNumber,
+ kMaxSequenceNumber, &merge_op, &filter);
+
+ c_iter_->SeekToFirst();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("a", 50, kTypeMerge), c_iter_->key().ToString());
+ ASSERT_EQ("av1", c_iter_->value().ToString());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ("bv1bv2", c_iter_->value().ToString());
+ c_iter_->Next();
+ ASSERT_OK(c_iter_->status());
+ ASSERT_EQ("cv1cv2", c_iter_->value().ToString());
+}
+
+// In bottommost level, values earlier than earliest snapshot can be output
+// with sequence = 0.
+TEST_P(CompactionIteratorTest, ZeroOutSequenceAtBottomLevel) {
+ AddSnapshot(1);
+ RunTest({test::KeyStr("a", 1, kTypeValue), test::KeyStr("b", 2, kTypeValue)},
+ {"v1", "v2"},
+ {test::KeyStr("a", 0, kTypeValue), test::KeyStr("b", 2, kTypeValue)},
+ {"v1", "v2"}, kMaxSequenceNumber /*last_committed_seq*/,
+ nullptr /*merge_operator*/, nullptr /*compaction_filter*/,
+ true /*bottommost_level*/);
+}
+
+// In bottommost level, deletions earlier than earliest snapshot can be removed
+// permanently.
+TEST_P(CompactionIteratorTest, RemoveDeletionAtBottomLevel) {
+ AddSnapshot(1);
+ RunTest(
+ {test::KeyStr("a", 1, kTypeDeletion), test::KeyStr("b", 3, kTypeDeletion),
+ test::KeyStr("b", 1, kTypeValue)},
+ {"", "", ""},
+ {test::KeyStr("b", 3, kTypeDeletion), test::KeyStr("b", 0, kTypeValue)},
+ {"", ""}, kMaxSequenceNumber /*last_committed_seq*/,
+ nullptr /*merge_operator*/, nullptr /*compaction_filter*/,
+ true /*bottommost_level*/);
+}
+
+// In bottommost level, single deletions earlier than earliest snapshot can be
+// removed permanently.
+TEST_P(CompactionIteratorTest, RemoveSingleDeletionAtBottomLevel) {
+ AddSnapshot(1);
+ RunTest({test::KeyStr("a", 1, kTypeSingleDeletion),
+ test::KeyStr("b", 2, kTypeSingleDeletion)},
+ {"", ""}, {test::KeyStr("b", 2, kTypeSingleDeletion)}, {""},
+ kMaxSequenceNumber /*last_committed_seq*/, nullptr /*merge_operator*/,
+ nullptr /*compaction_filter*/, true /*bottommost_level*/);
+}
+
+TEST_P(CompactionIteratorTest, ConvertToPutAtBottom) {
+ std::shared_ptr<MergeOperator> merge_op =
+ MergeOperators::CreateStringAppendOperator();
+ RunTest({test::KeyStr("a", 4, kTypeMerge), test::KeyStr("a", 3, kTypeMerge),
+ test::KeyStr("a", 2, kTypeMerge), test::KeyStr("b", 1, kTypeValue)},
+ {"a4", "a3", "a2", "b1"},
+ {test::KeyStr("a", 0, kTypeValue), test::KeyStr("b", 0, kTypeValue)},
+ {"a2,a3,a4", "b1"}, kMaxSequenceNumber /*last_committed_seq*/,
+ merge_op.get(), nullptr /*compaction_filter*/,
+ true /*bottomost_level*/);
+}
+
+INSTANTIATE_TEST_CASE_P(CompactionIteratorTestInstance, CompactionIteratorTest,
+ testing::Values(true, false));
+
+class PerKeyPlacementCompIteratorTest : public CompactionIteratorTest {
+ public:
+ bool SupportsPerKeyPlacement() const override { return true; }
+};
+
+TEST_P(PerKeyPlacementCompIteratorTest, SplitLastLevelData) {
+ std::atomic_uint64_t latest_cold_seq = 0;
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionIterator::PrepareOutput.context", [&](void* arg) {
+ auto context = static_cast<PerKeyPlacementContext*>(arg);
+ context->output_to_penultimate_level =
+ context->seq_num > latest_cold_seq;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ latest_cold_seq = 5;
+
+ InitIterators(
+ {test::KeyStr("a", 7, kTypeValue), test::KeyStr("b", 6, kTypeValue),
+ test::KeyStr("c", 5, kTypeValue)},
+ {"vala", "valb", "valc"}, {}, {}, kMaxSequenceNumber, kMaxSequenceNumber,
+ nullptr, nullptr, true);
+ c_iter_->SeekToFirst();
+ ASSERT_TRUE(c_iter_->Valid());
+
+  // the first 2 keys are hot, which should have
+  // `output_to_penultimate_level()==true` and their seq nums not zeroed out
+ ASSERT_EQ(test::KeyStr("a", 7, kTypeValue), c_iter_->key().ToString());
+ ASSERT_TRUE(c_iter_->output_to_penultimate_level());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("b", 6, kTypeValue), c_iter_->key().ToString());
+ ASSERT_TRUE(c_iter_->output_to_penultimate_level());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+  // `c` is cold data, which should be output to the bottommost level
+ ASSERT_EQ(test::KeyStr("c", 0, kTypeValue), c_iter_->key().ToString());
+ ASSERT_FALSE(c_iter_->output_to_penultimate_level());
+ c_iter_->Next();
+ ASSERT_OK(c_iter_->status());
+ ASSERT_FALSE(c_iter_->Valid());
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(PerKeyPlacementCompIteratorTest, SnapshotData) {
+ AddSnapshot(5);
+
+ InitIterators(
+ {test::KeyStr("a", 7, kTypeValue), test::KeyStr("b", 6, kTypeDeletion),
+ test::KeyStr("b", 5, kTypeValue)},
+ {"vala", "", "valb"}, {}, {}, kMaxSequenceNumber, kMaxSequenceNumber,
+ nullptr, nullptr, true);
+ c_iter_->SeekToFirst();
+ ASSERT_TRUE(c_iter_->Valid());
+
+  // The first key and the tombstone are newer than the snapshot, so they
+  // should be output to the penultimate level (and their seq nums cannot be
+  // zeroed out).
+ ASSERT_EQ(test::KeyStr("a", 7, kTypeValue), c_iter_->key().ToString());
+ ASSERT_TRUE(c_iter_->output_to_penultimate_level());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("b", 6, kTypeDeletion), c_iter_->key().ToString());
+ ASSERT_TRUE(c_iter_->output_to_penultimate_level());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+  // `b` (seq 5) is not protected by the snapshot, so its sequence number is
+  // zeroed out and it should be output to the bottommost level
+ ASSERT_EQ(test::KeyStr("b", 0, kTypeValue), c_iter_->key().ToString());
+ ASSERT_FALSE(c_iter_->output_to_penultimate_level());
+ c_iter_->Next();
+ ASSERT_OK(c_iter_->status());
+ ASSERT_FALSE(c_iter_->Valid());
+}
+
+TEST_P(PerKeyPlacementCompIteratorTest, ConflictWithSnapshot) {
+ std::atomic_uint64_t latest_cold_seq = 0;
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionIterator::PrepareOutput.context", [&](void* arg) {
+ auto context = static_cast<PerKeyPlacementContext*>(arg);
+ context->output_to_penultimate_level =
+ context->seq_num > latest_cold_seq;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ latest_cold_seq = 6;
+
+ AddSnapshot(5);
+
+ InitIterators({test::KeyStr("a", 7, kTypeValue),
+ test::KeyStr("unsafe_pb", 6, kTypeValue),
+ test::KeyStr("c", 5, kTypeValue)},
+ {"vala", "valb", "valc"}, {}, {}, kMaxSequenceNumber,
+ kMaxSequenceNumber, nullptr, nullptr, true);
+ c_iter_->SeekToFirst();
+ ASSERT_TRUE(c_iter_->Valid());
+
+ ASSERT_EQ(test::KeyStr("a", 7, kTypeValue), c_iter_->key().ToString());
+ ASSERT_TRUE(c_iter_->output_to_penultimate_level());
+  // The 2nd key is unsafe to output to the penultimate level, but it is newer
+  // than the snapshot, so the per_key_placement feature has to output it to
+  // the penultimate level, which is a corruption. We should never see such a
+  // case, as data newer than a snapshot should always come from a higher
+  // compaction input level, which makes it safe to output to the penultimate
+  // level.
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->status().IsCorruption());
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+INSTANTIATE_TEST_CASE_P(PerKeyPlacementCompIteratorTest,
+ PerKeyPlacementCompIteratorTest,
+ testing::Values(true, false));
+
+// Tests how CompactionIterator work together with SnapshotChecker.
+class CompactionIteratorWithSnapshotCheckerTest
+ : public CompactionIteratorTest {
+ public:
+ bool UseSnapshotChecker() const override { return true; }
+};
+
+// Uncommitted keys (keys with seq > last_committed_seq) should be output as-is
+// while committed versions of these keys should get compacted as usual.
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ PreserveUncommittedKeys_Value) {
+ RunTest(
+ {test::KeyStr("foo", 3, kTypeValue), test::KeyStr("foo", 2, kTypeValue),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"v3", "v2", "v1"},
+ {test::KeyStr("foo", 3, kTypeValue), test::KeyStr("foo", 2, kTypeValue)},
+ {"v3", "v2"}, 2 /*last_committed_seq*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ PreserveUncommittedKeys_Deletion) {
+ RunTest({test::KeyStr("foo", 2, kTypeDeletion),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"", "v1"},
+ {test::KeyStr("foo", 2, kTypeDeletion),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"", "v1"}, 1 /*last_committed_seq*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ PreserveUncommittedKeys_Merge) {
+ auto merge_op = MergeOperators::CreateStringAppendOperator();
+ RunTest(
+ {test::KeyStr("foo", 3, kTypeMerge), test::KeyStr("foo", 2, kTypeMerge),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"v3", "v2", "v1"},
+ {test::KeyStr("foo", 3, kTypeMerge), test::KeyStr("foo", 2, kTypeValue)},
+ {"v3", "v1,v2"}, 2 /*last_committed_seq*/, merge_op.get());
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ PreserveUncommittedKeys_SingleDelete) {
+ RunTest({test::KeyStr("foo", 2, kTypeSingleDeletion),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"", "v1"},
+ {test::KeyStr("foo", 2, kTypeSingleDeletion),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"", "v1"}, 1 /*last_committed_seq*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ PreserveUncommittedKeys_BlobIndex) {
+ RunTest({test::KeyStr("foo", 3, kTypeBlobIndex),
+ test::KeyStr("foo", 2, kTypeBlobIndex),
+ test::KeyStr("foo", 1, kTypeBlobIndex)},
+ {"v3", "v2", "v1"},
+ {test::KeyStr("foo", 3, kTypeBlobIndex),
+ test::KeyStr("foo", 2, kTypeBlobIndex)},
+ {"v3", "v2"}, 2 /*last_committed_seq*/);
+}
+
+// Test that the compaction iterator dedups keys visible to the same snapshot.
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest, DedupSameSnapshot_Value) {
+ AddSnapshot(2, 1);
+ RunTest(
+ {test::KeyStr("foo", 4, kTypeValue), test::KeyStr("foo", 3, kTypeValue),
+ test::KeyStr("foo", 2, kTypeValue), test::KeyStr("foo", 1, kTypeValue)},
+ {"v4", "v3", "v2", "v1"},
+ {test::KeyStr("foo", 4, kTypeValue), test::KeyStr("foo", 3, kTypeValue),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"v4", "v3", "v1"}, 3 /*last_committed_seq*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest, DedupSameSnapshot_Deletion) {
+ AddSnapshot(2, 1);
+ RunTest(
+ {test::KeyStr("foo", 4, kTypeValue),
+ test::KeyStr("foo", 3, kTypeDeletion),
+ test::KeyStr("foo", 2, kTypeValue), test::KeyStr("foo", 1, kTypeValue)},
+ {"v4", "", "v2", "v1"},
+ {test::KeyStr("foo", 4, kTypeValue),
+ test::KeyStr("foo", 3, kTypeDeletion),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"v4", "", "v1"}, 3 /*last_committed_seq*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest, DedupSameSnapshot_Merge) {
+ AddSnapshot(2, 1);
+ AddSnapshot(4, 3);
+ auto merge_op = MergeOperators::CreateStringAppendOperator();
+ RunTest(
+ {test::KeyStr("foo", 5, kTypeMerge), test::KeyStr("foo", 4, kTypeMerge),
+ test::KeyStr("foo", 3, kTypeMerge), test::KeyStr("foo", 2, kTypeMerge),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"v5", "v4", "v3", "v2", "v1"},
+ {test::KeyStr("foo", 5, kTypeMerge), test::KeyStr("foo", 4, kTypeMerge),
+ test::KeyStr("foo", 3, kTypeMerge), test::KeyStr("foo", 1, kTypeValue)},
+ {"v5", "v4", "v2,v3", "v1"}, 4 /*last_committed_seq*/, merge_op.get());
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ DedupSameSnapshot_SingleDeletion) {
+ AddSnapshot(2, 1);
+ RunTest(
+ {test::KeyStr("foo", 4, kTypeValue),
+ test::KeyStr("foo", 3, kTypeSingleDeletion),
+ test::KeyStr("foo", 2, kTypeValue), test::KeyStr("foo", 1, kTypeValue)},
+ {"v4", "", "v2", "v1"},
+ {test::KeyStr("foo", 4, kTypeValue), test::KeyStr("foo", 1, kTypeValue)},
+ {"v4", "v1"}, 3 /*last_committed_seq*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest, DedupSameSnapshot_BlobIndex) {
+ AddSnapshot(2, 1);
+ RunTest({test::KeyStr("foo", 4, kTypeBlobIndex),
+ test::KeyStr("foo", 3, kTypeBlobIndex),
+ test::KeyStr("foo", 2, kTypeBlobIndex),
+ test::KeyStr("foo", 1, kTypeBlobIndex)},
+ {"v4", "v3", "v2", "v1"},
+ {test::KeyStr("foo", 4, kTypeBlobIndex),
+ test::KeyStr("foo", 3, kTypeBlobIndex),
+ test::KeyStr("foo", 1, kTypeBlobIndex)},
+ {"v4", "v3", "v1"}, 3 /*last_committed_seq*/);
+}
+
+// At the bottom level, sequence numbers can be zeroed out and deletions can be
+// removed, but only when they are visible to the earliest snapshot.
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ NotZeroOutSequenceIfNotVisibleToEarliestSnapshot) {
+ AddSnapshot(2, 1);
+ RunTest({test::KeyStr("a", 1, kTypeValue), test::KeyStr("b", 2, kTypeValue),
+ test::KeyStr("c", 3, kTypeValue)},
+ {"v1", "v2", "v3"},
+ {test::KeyStr("a", 0, kTypeValue), test::KeyStr("b", 2, kTypeValue),
+ test::KeyStr("c", 3, kTypeValue)},
+ {"v1", "v2", "v3"}, kMaxSequenceNumber /*last_committed_seq*/,
+ nullptr /*merge_operator*/, nullptr /*compaction_filter*/,
+ true /*bottommost_level*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ NotRemoveDeletionIfNotVisibleToEarliestSnapshot) {
+ AddSnapshot(2, 1);
+ RunTest(
+ {test::KeyStr("a", 1, kTypeDeletion), test::KeyStr("b", 2, kTypeDeletion),
+ test::KeyStr("c", 3, kTypeDeletion)},
+ {"", "", ""}, {}, {"", ""}, kMaxSequenceNumber /*last_committed_seq*/,
+ nullptr /*merge_operator*/, nullptr /*compaction_filter*/,
+ true /*bottommost_level*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ NotRemoveDeletionIfValuePresentToEarlierSnapshot) {
+ AddSnapshot(2, 1);
+ RunTest({test::KeyStr("a", 4, kTypeDeletion),
+ test::KeyStr("a", 1, kTypeValue), test::KeyStr("b", 3, kTypeValue)},
+ {"", "", ""},
+ {test::KeyStr("a", 4, kTypeDeletion),
+ test::KeyStr("a", 0, kTypeValue), test::KeyStr("b", 3, kTypeValue)},
+ {"", "", ""}, kMaxSequenceNumber /*last_committed_seq*/,
+ nullptr /*merge_operator*/, nullptr /*compaction_filter*/,
+ true /*bottommost_level*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ NotRemoveSingleDeletionIfNotVisibleToEarliestSnapshot) {
+ AddSnapshot(2, 1);
+ RunTest({test::KeyStr("a", 1, kTypeSingleDeletion),
+ test::KeyStr("b", 2, kTypeSingleDeletion),
+ test::KeyStr("c", 3, kTypeSingleDeletion)},
+ {"", "", ""},
+ {test::KeyStr("b", 2, kTypeSingleDeletion),
+ test::KeyStr("c", 3, kTypeSingleDeletion)},
+ {"", ""}, kMaxSequenceNumber /*last_committed_seq*/,
+ nullptr /*merge_operator*/, nullptr /*compaction_filter*/,
+ true /*bottommost_level*/);
+}
+
+// Single delete should not cancel out values that are not visible to the
+// same set of snapshots
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ SingleDeleteAcrossSnapshotBoundary) {
+ AddSnapshot(2, 1);
+ RunTest({test::KeyStr("a", 2, kTypeSingleDeletion),
+ test::KeyStr("a", 1, kTypeValue)},
+ {"", "v1"},
+ {test::KeyStr("a", 2, kTypeSingleDeletion),
+ test::KeyStr("a", 1, kTypeValue)},
+ {"", "v1"}, 2 /*last_committed_seq*/);
+}
+
+// A single delete should be kept if it is not visible to the earliest write
+// conflict snapshot. If a single delete is kept for this reason, the
+// corresponding value can be trimmed to save space.
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ KeepSingleDeletionForWriteConflictChecking) {
+ AddSnapshot(2, 0);
+ RunTest({test::KeyStr("a", 2, kTypeSingleDeletion),
+ test::KeyStr("a", 1, kTypeValue)},
+ {"", "v1"},
+ {test::KeyStr("a", 2, kTypeSingleDeletion),
+ test::KeyStr("a", 1, kTypeValue)},
+ {"", ""}, 2 /*last_committed_seq*/, nullptr /*merge_operator*/,
+ nullptr /*compaction_filter*/, false /*bottommost_level*/,
+ 2 /*earliest_write_conflict_snapshot*/);
+}
+
+// Same as above but with a blob index. In addition to the value getting
+// trimmed, the type of the KV is changed to kTypeValue.
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ KeepSingleDeletionForWriteConflictChecking_BlobIndex) {
+ AddSnapshot(2, 0);
+ RunTest({test::KeyStr("a", 2, kTypeSingleDeletion),
+ test::KeyStr("a", 1, kTypeBlobIndex)},
+ {"", "fake_blob_index"},
+ {test::KeyStr("a", 2, kTypeSingleDeletion),
+ test::KeyStr("a", 1, kTypeValue)},
+ {"", ""}, 2 /*last_committed_seq*/, nullptr /*merge_operator*/,
+ nullptr /*compaction_filter*/, false /*bottommost_level*/,
+ 2 /*earliest_write_conflict_snapshot*/);
+}
+
+// Same as above but with a wide-column entity. In addition to the value getting
+// trimmed, the type of the KV is changed to kTypeValue.
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ KeepSingleDeletionForWriteConflictChecking_WideColumnEntity) {
+ AddSnapshot(2, 0);
+ RunTest({test::KeyStr("a", 2, kTypeSingleDeletion),
+ test::KeyStr("a", 1, kTypeWideColumnEntity)},
+ {"", "fake_entity"},
+ {test::KeyStr("a", 2, kTypeSingleDeletion),
+ test::KeyStr("a", 1, kTypeValue)},
+ {"", ""}, 2 /* last_committed_seq */, nullptr /* merge_operator */,
+ nullptr /* compaction_filter */, false /* bottommost_level */,
+ 2 /* earliest_write_conflict_snapshot */);
+}
+
+// The compaction filter should keep uncommitted keys as-is, and
+// * convert the latest committed value to a deletion, and/or
+// * if the latest value is a merge, apply the filter to all subsequent merges.
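+// As a rough sketch (the actual FilterAllKeysCompactionFilter used below is
+// defined earlier in this file and may differ in detail), a drop-everything
+// compaction filter could look like:
+//
+//   class FilterAllKeysCompactionFilter : public CompactionFilter {
+//    public:
+//     bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+//                 std::string* /*new_value*/,
+//                 bool* /*value_changed*/) const override {
+//       return true;  // ask compaction to remove every key it sees
+//     }
+//     const char* Name() const override { return "FilterAllKeys"; }
+//   };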
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest, CompactionFilter_Value) {
+ std::unique_ptr<CompactionFilter> compaction_filter(
+ new FilterAllKeysCompactionFilter());
+ RunTest(
+ {test::KeyStr("a", 2, kTypeValue), test::KeyStr("a", 1, kTypeValue),
+ test::KeyStr("b", 3, kTypeValue), test::KeyStr("c", 1, kTypeValue)},
+ {"v2", "v1", "v3", "v4"},
+ {test::KeyStr("a", 2, kTypeValue), test::KeyStr("a", 1, kTypeDeletion),
+ test::KeyStr("b", 3, kTypeValue), test::KeyStr("c", 1, kTypeDeletion)},
+ {"v2", "", "v3", ""}, 1 /*last_committed_seq*/,
+ nullptr /*merge_operator*/, compaction_filter.get());
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest, CompactionFilter_Deletion) {
+ std::unique_ptr<CompactionFilter> compaction_filter(
+ new FilterAllKeysCompactionFilter());
+ RunTest(
+ {test::KeyStr("a", 2, kTypeDeletion), test::KeyStr("a", 1, kTypeValue)},
+ {"", "v1"},
+ {test::KeyStr("a", 2, kTypeDeletion),
+ test::KeyStr("a", 1, kTypeDeletion)},
+ {"", ""}, 1 /*last_committed_seq*/, nullptr /*merge_operator*/,
+ compaction_filter.get());
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ CompactionFilter_PartialMerge) {
+ std::shared_ptr<MergeOperator> merge_op =
+ MergeOperators::CreateStringAppendOperator();
+ std::unique_ptr<CompactionFilter> compaction_filter(
+ new FilterAllKeysCompactionFilter());
+ RunTest({test::KeyStr("a", 3, kTypeMerge), test::KeyStr("a", 2, kTypeMerge),
+ test::KeyStr("a", 1, kTypeMerge)},
+ {"v3", "v2", "v1"}, {test::KeyStr("a", 3, kTypeMerge)}, {"v3"},
+ 2 /*last_committed_seq*/, merge_op.get(), compaction_filter.get());
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest, CompactionFilter_FullMerge) {
+ std::shared_ptr<MergeOperator> merge_op =
+ MergeOperators::CreateStringAppendOperator();
+ std::unique_ptr<CompactionFilter> compaction_filter(
+ new FilterAllKeysCompactionFilter());
+ RunTest(
+ {test::KeyStr("a", 3, kTypeMerge), test::KeyStr("a", 2, kTypeMerge),
+ test::KeyStr("a", 1, kTypeValue)},
+ {"v3", "v2", "v1"},
+ {test::KeyStr("a", 3, kTypeMerge), test::KeyStr("a", 1, kTypeDeletion)},
+ {"v3", ""}, 2 /*last_committed_seq*/, merge_op.get(),
+ compaction_filter.get());
+}
+
+// Tests how CompactionIterator works together with AllowIngestBehind.
+class CompactionIteratorWithAllowIngestBehindTest
+ : public CompactionIteratorTest {
+ public:
+ bool AllowIngestBehind() const override { return true; }
+};
+
+// When allow_ingest_behind is set, the compaction iterator does not target the
+// bottommost level, since there is no guarantee that no further data will be
+// ingested under the compaction output in the future.
+TEST_P(CompactionIteratorWithAllowIngestBehindTest, NoConvertToPutAtBottom) {
+ std::shared_ptr<MergeOperator> merge_op =
+ MergeOperators::CreateStringAppendOperator();
+ RunTest({test::KeyStr("a", 4, kTypeMerge), test::KeyStr("a", 3, kTypeMerge),
+ test::KeyStr("a", 2, kTypeMerge), test::KeyStr("b", 1, kTypeValue)},
+ {"a4", "a3", "a2", "b1"},
+ {test::KeyStr("a", 4, kTypeMerge), test::KeyStr("b", 1, kTypeValue)},
+ {"a2,a3,a4", "b1"}, kMaxSequenceNumber /*last_committed_seq*/,
+ merge_op.get(), nullptr /*compaction_filter*/,
+ true /*bottommost_level*/);
+}
+
+TEST_P(CompactionIteratorWithAllowIngestBehindTest,
+ MergeToPutIfEncounteredPutAtBottom) {
+ std::shared_ptr<MergeOperator> merge_op =
+ MergeOperators::CreateStringAppendOperator();
+ RunTest({test::KeyStr("a", 4, kTypeMerge), test::KeyStr("a", 3, kTypeMerge),
+ test::KeyStr("a", 2, kTypeValue), test::KeyStr("b", 1, kTypeValue)},
+ {"a4", "a3", "a2", "b1"},
+ {test::KeyStr("a", 4, kTypeValue), test::KeyStr("b", 1, kTypeValue)},
+ {"a2,a3,a4", "b1"}, kMaxSequenceNumber /*last_committed_seq*/,
+ merge_op.get(), nullptr /*compaction_filter*/,
+ true /*bottommost_level*/);
+}
+
+INSTANTIATE_TEST_CASE_P(CompactionIteratorWithAllowIngestBehindTestInstance,
+ CompactionIteratorWithAllowIngestBehindTest,
+ testing::Values(true, false));
+
+class CompactionIteratorTsGcTest : public CompactionIteratorTest {
+ public:
+ CompactionIteratorTsGcTest()
+ : CompactionIteratorTest(test::BytewiseComparatorWithU64TsWrapper()) {}
+};
+
+TEST_P(CompactionIteratorTsGcTest, NoKeyEligibleForGC) {
+ constexpr char user_key[][2] = {{'a', '\0'}, {'b', '\0'}};
+ const std::vector<std::string> input_keys = {
+ test::KeyStr(/*ts=*/103, user_key[0], /*seq=*/4, kTypeValue),
+ test::KeyStr(/*ts=*/102, user_key[0], /*seq=*/3,
+ kTypeDeletionWithTimestamp),
+ test::KeyStr(/*ts=*/104, user_key[1], /*seq=*/5, kTypeValue)};
+ const std::vector<std::string> input_values = {"a3", "", "b2"};
+ std::string full_history_ts_low;
+ // All keys' timestamps are newer than or equal to 102, thus none of them
+ // will be eligible for GC.
+ PutFixed64(&full_history_ts_low, 102);
+ const std::vector<std::string>& expected_keys = input_keys;
+ const std::vector<std::string>& expected_values = input_values;
+ const std::vector<std::pair<bool, bool>> params = {
+ {false, false}, {false, true}, {true, true}};
+ for (const std::pair<bool, bool>& param : params) {
+ const bool bottommost_level = param.first;
+ const bool key_not_exists_beyond_output_level = param.second;
+ RunTest(input_keys, input_values, expected_keys, expected_values,
+ /*last_committed_seq=*/kMaxSequenceNumber,
+ /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+ bottommost_level,
+ /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+ key_not_exists_beyond_output_level, &full_history_ts_low);
+ }
+}
+
+TEST_P(CompactionIteratorTsGcTest, NoMergeEligibleForGc) {
+ constexpr char user_key[] = "a";
+ const std::vector<std::string> input_keys = {
+ test::KeyStr(10002, user_key, 102, kTypeMerge),
+ test::KeyStr(10001, user_key, 101, kTypeMerge),
+ test::KeyStr(10000, user_key, 100, kTypeValue)};
+ const std::vector<std::string> input_values = {"2", "1", "a0"};
+ std::shared_ptr<MergeOperator> merge_op =
+ MergeOperators::CreateStringAppendTESTOperator();
+ const std::vector<std::string>& expected_keys = input_keys;
+ const std::vector<std::string>& expected_values = input_values;
+ const std::vector<std::pair<bool, bool>> params = {
+ {false, false}, {false, true}, {true, true}};
+ for (const auto& param : params) {
+ const bool bottommost_level = param.first;
+ const bool key_not_exists_beyond_output_level = param.second;
+ RunTest(input_keys, input_values, expected_keys, expected_values,
+ /*last_committed_seq=*/kMaxSequenceNumber, merge_op.get(),
+ /*compaction_filter=*/nullptr, bottommost_level,
+ /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+ key_not_exists_beyond_output_level,
+ /*full_history_ts_low=*/nullptr);
+ }
+}
+
+TEST_P(CompactionIteratorTsGcTest, AllKeysOlderThanThreshold) {
+ constexpr char user_key[][2] = {{'a', '\0'}, {'b', '\0'}};
+ const std::vector<std::string> input_keys = {
+ test::KeyStr(/*ts=*/103, user_key[0], /*seq=*/4,
+ kTypeDeletionWithTimestamp),
+ test::KeyStr(/*ts=*/102, user_key[0], /*seq=*/3, kTypeValue),
+ test::KeyStr(/*ts=*/101, user_key[0], /*seq=*/2, kTypeValue),
+ test::KeyStr(/*ts=*/104, user_key[1], /*seq=*/5, kTypeValue)};
+ const std::vector<std::string> input_values = {"", "a2", "a1", "b5"};
+ std::string full_history_ts_low;
+ PutFixed64(&full_history_ts_low, std::numeric_limits<uint64_t>::max());
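+ // Setting full_history_ts_low to the maximum encodable timestamp means every
+ // key's timestamp falls below the threshold, so all keys are candidates for
+ // timestamp GC in this test.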
+ {
+ // With a snapshot at seq 3, both the deletion marker and the key at seq 3
+ // must be preserved.
+ AddSnapshot(3);
+ const std::vector<std::string> expected_keys = {
+ input_keys[0], input_keys[1], input_keys[3]};
+ const std::vector<std::string> expected_values = {"", "a2", "b5"};
+ RunTest(input_keys, input_values, expected_keys, expected_values,
+ /*last_committed_seq=*/kMaxSequenceNumber,
+ /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+ /*bottommost_level=*/false,
+ /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+ /*key_not_exists_beyond_output_level=*/false, &full_history_ts_low);
+ ClearSnapshots();
+ }
+ {
+ // No snapshot: the deletion marker should be preserved because the user
+ // key may appear beyond the output level.
+ const std::vector<std::string> expected_keys = {input_keys[0],
+ input_keys[3]};
+ const std::vector<std::string> expected_values = {"", "b5"};
+ RunTest(input_keys, input_values, expected_keys, expected_values,
+ /*last_committed_seq=*/kMaxSequenceNumber,
+ /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+ /*bottommost_level=*/false,
+ /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+ /*key_not_exists_beyond_output_level=*/false, &full_history_ts_low);
+ }
+ {
+ // No snapshot: the deletion marker can be dropped because the user key
+ // does not exist beyond the output level.
+ const std::vector<std::string> expected_keys = {input_keys[3]};
+ const std::vector<std::string> expected_values = {"b5"};
+ RunTest(input_keys, input_values, expected_keys, expected_values,
+ /*last_committed_seq=*/kMaxSequenceNumber,
+ /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+ /*bottommost_level=*/false,
+ /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+ /*key_not_exists_beyond_output_level=*/true, &full_history_ts_low);
+ }
+}
+
+TEST_P(CompactionIteratorTsGcTest, SomeMergesOlderThanThreshold) {
+ constexpr char user_key[][2] = {"a", "f"};
+ const std::vector<std::string> input_keys = {
+ test::KeyStr(/*ts=*/25000, user_key[0], /*seq=*/2500, kTypeMerge),
+ test::KeyStr(/*ts=*/19000, user_key[0], /*seq=*/2300, kTypeMerge),
+ test::KeyStr(/*ts=*/18000, user_key[0], /*seq=*/1800, kTypeMerge),
+ test::KeyStr(/*ts=*/16000, user_key[0], /*seq=*/1600, kTypeValue),
+ test::KeyStr(/*ts=*/19000, user_key[1], /*seq=*/2000, kTypeMerge),
+ test::KeyStr(/*ts=*/17000, user_key[1], /*seq=*/1700, kTypeMerge),
+ test::KeyStr(/*ts=*/15000, user_key[1], /*seq=*/1600,
+ kTypeDeletionWithTimestamp)};
+ const std::vector<std::string> input_values = {"25", "19", "18", "16",
+ "19", "17", ""};
+ std::shared_ptr<MergeOperator> merge_op =
+ MergeOperators::CreateStringAppendTESTOperator();
+ std::string full_history_ts_low;
+ PutFixed64(&full_history_ts_low, 20000);
+
+ const std::vector<std::pair<bool, bool>> params = {
+ {false, false}, {false, true}, {true, true}};
+
+ {
+ AddSnapshot(1600);
+ AddSnapshot(1900);
+ const std::vector<std::string> expected_keys = {
+ test::KeyStr(/*ts=*/25000, user_key[0], /*seq=*/2500, kTypeMerge),
+ test::KeyStr(/*ts=*/19000, user_key[0], /*seq=*/2300, kTypeMerge),
+ test::KeyStr(/*ts=*/18000, user_key[0], /*seq=*/1800, kTypeMerge),
+ test::KeyStr(/*ts=*/16000, user_key[0], /*seq=*/1600, kTypeValue),
+ test::KeyStr(/*ts=*/19000, user_key[1], /*seq=*/2000, kTypeMerge),
+ test::KeyStr(/*ts=*/17000, user_key[1], /*seq=*/1700, kTypeMerge),
+ test::KeyStr(/*ts=*/15000, user_key[1], /*seq=*/1600,
+ kTypeDeletionWithTimestamp)};
+ const std::vector<std::string> expected_values = {"25", "19", "18", "16",
+ "19", "17", ""};
+ for (const auto& param : params) {
+ const bool bottommost_level = param.first;
+ const bool key_not_exists_beyond_output_level = param.second;
+ auto expected_keys_copy = expected_keys;
+ auto expected_values_copy = expected_values;
+ if (bottommost_level || key_not_exists_beyond_output_level) {
+ // The kTypeDeletionWithTimestamp entry will be dropped
+ expected_keys_copy.pop_back();
+ expected_values_copy.pop_back();
+ if (bottommost_level) {
+ // Both ts and seq are zeroed out
+ expected_keys_copy[3] =
+ test::KeyStr(/*ts=*/0, user_key[0], /*seq=*/0, kTypeValue);
+ }
+ }
+ RunTest(input_keys, input_values, expected_keys_copy,
+ expected_values_copy,
+ /*last_committed_seq=*/kMaxSequenceNumber, merge_op.get(),
+ /*compaction_filter=*/nullptr, bottommost_level,
+ /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+ key_not_exists_beyond_output_level, &full_history_ts_low);
+ }
+ ClearSnapshots();
+ }
+
+ // No snapshots
+ {
+ const std::vector<std::string> expected_keys = {
+ test::KeyStr(/*ts=*/25000, user_key[0], /*seq=*/2500, kTypeValue),
+ test::KeyStr(/*ts=*/19000, user_key[1], /*seq=*/2000, kTypeValue)};
+ const std::vector<std::string> expected_values = {"16,18,19,25", "17,19"};
+ for (const auto& param : params) {
+ const bool bottommost_level = param.first;
+ const bool key_not_exists_beyond_output_level = param.second;
+ auto expected_keys_copy = expected_keys;
+ auto expected_values_copy = expected_values;
+ if (bottommost_level) {
+ expected_keys_copy[1] =
+ test::KeyStr(/*ts=*/0, user_key[1], /*seq=*/0, kTypeValue);
+ }
+ RunTest(input_keys, input_values, expected_keys_copy,
+ expected_values_copy,
+ /*last_committed_seq=*/kMaxSequenceNumber, merge_op.get(),
+ /*compaction_filter=*/nullptr, bottommost_level,
+ /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+ key_not_exists_beyond_output_level, &full_history_ts_low);
+ }
+ }
+}
+
+TEST_P(CompactionIteratorTsGcTest, NewHidesOldSameSnapshot) {
+ constexpr char user_key[] = "a";
+ const std::vector<std::string> input_keys = {
+ test::KeyStr(/*ts=*/103, user_key, /*seq=*/4, kTypeDeletionWithTimestamp),
+ test::KeyStr(/*ts=*/102, user_key, /*seq=*/3, kTypeValue),
+ test::KeyStr(/*ts=*/101, user_key, /*seq=*/2, kTypeValue),
+ test::KeyStr(/*ts=*/100, user_key, /*seq=*/1, kTypeValue)};
+ const std::vector<std::string> input_values = {"", "a2", "a1", "a0"};
+ {
+ std::string full_history_ts_low;
+ // Keys whose timestamps are larger than or equal to 102 will be preserved.
+ PutFixed64(&full_history_ts_low, 102);
+ const std::vector<std::string> expected_keys = {
+ input_keys[0], input_keys[1], input_keys[2]};
+ const std::vector<std::string> expected_values = {"", input_values[1],
+ input_values[2]};
+ RunTest(input_keys, input_values, expected_keys, expected_values,
+ /*last_committed_seq=*/kMaxSequenceNumber,
+ /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+ /*bottommost_level=*/false,
+ /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+ /*key_not_exists_beyond_output_level=*/false, &full_history_ts_low);
+ }
+}
+
+TEST_P(CompactionIteratorTsGcTest, DropTombstones) {
+ constexpr char user_key[] = "a";
+ const std::vector<std::string> input_keys = {
+ test::KeyStr(/*ts=*/103, user_key, /*seq=*/4, kTypeDeletionWithTimestamp),
+ test::KeyStr(/*ts=*/102, user_key, /*seq=*/3, kTypeValue),
+ test::KeyStr(/*ts=*/101, user_key, /*seq=*/2, kTypeDeletionWithTimestamp),
+ test::KeyStr(/*ts=*/100, user_key, /*seq=*/1, kTypeValue)};
+ const std::vector<std::string> input_values = {"", "a2", "", "a0"};
+ const std::vector<std::string> expected_keys = {input_keys[0], input_keys[1]};
+ const std::vector<std::string> expected_values = {"", "a2"};
+
+ // Take a snapshot at seq 2.
+ AddSnapshot(2);
+
+ {
+ // Non-bottommost level, but key does not exist beyond output level.
+ std::string full_history_ts_low;
+ PutFixed64(&full_history_ts_low, 102);
+ RunTest(input_keys, input_values, expected_keys, expected_values,
+ /*last_committed_sequence=*/kMaxSequenceNumber,
+ /*merge_op=*/nullptr, /*compaction_filter=*/nullptr,
+ /*bottommost_level=*/false,
+ /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+ /*key_not_exists_beyond_output_level=*/true, &full_history_ts_low);
+ }
+ {
+ // Bottommost level
+ std::string full_history_ts_low;
+ PutFixed64(&full_history_ts_low, 102);
+ RunTest(input_keys, input_values, expected_keys, expected_values,
+ /*last_committed_seq=*/kMaxSequenceNumber,
+ /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+ /*bottommost_level=*/true,
+ /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+ /*key_not_exists_beyond_output_level=*/false, &full_history_ts_low);
+ }
+}
+
+TEST_P(CompactionIteratorTsGcTest, RewriteTs) {
+ constexpr char user_key[] = "a";
+ const std::vector<std::string> input_keys = {
+ test::KeyStr(/*ts=*/103, user_key, /*seq=*/4, kTypeDeletionWithTimestamp),
+ test::KeyStr(/*ts=*/102, user_key, /*seq=*/3, kTypeValue),
+ test::KeyStr(/*ts=*/101, user_key, /*seq=*/2, kTypeDeletionWithTimestamp),
+ test::KeyStr(/*ts=*/100, user_key, /*seq=*/1, kTypeValue)};
+ const std::vector<std::string> input_values = {"", "a2", "", "a0"};
+ const std::vector<std::string> expected_keys = {
+ input_keys[0], input_keys[1], input_keys[2],
+ test::KeyStr(/*ts=*/0, user_key, /*seq=*/0, kTypeValue)};
+ const std::vector<std::string> expected_values = {"", "a2", "", "a0"};
+
+ AddSnapshot(1);
+ AddSnapshot(2);
+
+ {
+ // Bottommost level and need to rewrite both ts and seq.
+ std::string full_history_ts_low;
+ PutFixed64(&full_history_ts_low, 102);
+ RunTest(input_keys, input_values, expected_keys, expected_values,
+ /*last_committed_seq=*/kMaxSequenceNumber,
+ /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+ /*bottommost_level=*/true,
+ /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+ /*key_not_exists_beyond_output_level=*/true, &full_history_ts_low);
+ }
+}
+
+TEST_P(CompactionIteratorTsGcTest, SingleDeleteNoKeyEligibleForGC) {
+ constexpr char user_key[][2] = {{'a', '\0'}, {'b', '\0'}};
+ const std::vector<std::string> input_keys = {
+ test::KeyStr(/*ts=*/104, user_key[0], /*seq=*/4, kTypeSingleDeletion),
+ test::KeyStr(/*ts=*/103, user_key[0], /*seq=*/3, kTypeValue),
+ test::KeyStr(/*ts=*/102, user_key[1], /*seq=*/2, kTypeValue)};
+ const std::vector<std::string> input_values = {"", "a3", "b2"};
+ std::string full_history_ts_low;
+ // All keys' timestamps are newer than or equal to 102, thus none of them
+ // will be eligible for GC.
+ PutFixed64(&full_history_ts_low, 102);
+ const std::vector<std::string>& expected_keys = input_keys;
+ const std::vector<std::string>& expected_values = input_values;
+ const std::vector<std::pair<bool, bool>> params = {
+ {false, false}, {false, true}, {true, true}};
+ for (const std::pair<bool, bool>& param : params) {
+ const bool bottommost_level = param.first;
+ const bool key_not_exists_beyond_output_level = param.second;
+ RunTest(input_keys, input_values, expected_keys, expected_values,
+ /*last_committed_seq=*/kMaxSequenceNumber,
+ /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+ bottommost_level,
+ /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+ key_not_exists_beyond_output_level, &full_history_ts_low);
+ }
+}
+
+TEST_P(CompactionIteratorTsGcTest, SingleDeleteDropTombstones) {
+ constexpr char user_key[] = "a";
+ const std::vector<std::string> input_keys = {
+ test::KeyStr(/*ts=*/103, user_key, /*seq=*/4, kTypeSingleDeletion),
+ test::KeyStr(/*ts=*/102, user_key, /*seq=*/3, kTypeValue),
+ test::KeyStr(/*ts=*/101, user_key, /*seq=*/2, kTypeSingleDeletion),
+ test::KeyStr(/*ts=*/100, user_key, /*seq=*/1, kTypeValue)};
+ const std::vector<std::string> input_values = {"", "a2", "", "a0"};
+ const std::vector<std::string> expected_keys = {input_keys[0], input_keys[1]};
+ const std::vector<std::string> expected_values = {"", "a2"};
+
+ // Take a snapshot at seq 2.
+ AddSnapshot(2);
+ {
+ const std::vector<std::pair<bool, bool>> params = {
+ {false, false}, {false, true}, {true, true}};
+ for (const std::pair<bool, bool>& param : params) {
+ const bool bottommost_level = param.first;
+ const bool key_not_exists_beyond_output_level = param.second;
+ std::string full_history_ts_low;
+ PutFixed64(&full_history_ts_low, 102);
+ RunTest(input_keys, input_values, expected_keys, expected_values,
+ /*last_committed_seq=*/kMaxSequenceNumber,
+ /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+ bottommost_level,
+ /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+ key_not_exists_beyond_output_level, &full_history_ts_low);
+ }
+ }
+}
+
+TEST_P(CompactionIteratorTsGcTest, SingleDeleteAllKeysOlderThanThreshold) {
+ constexpr char user_key[][2] = {{'a', '\0'}, {'b', '\0'}};
+ const std::vector<std::string> input_keys = {
+ test::KeyStr(/*ts=*/103, user_key[0], /*seq=*/4, kTypeSingleDeletion),
+ test::KeyStr(/*ts=*/102, user_key[0], /*seq=*/3, kTypeValue),
+ test::KeyStr(/*ts=*/104, user_key[1], /*seq=*/5, kTypeValue)};
+ const std::vector<std::string> input_values = {"", "a2", "b5"};
+ std::string full_history_ts_low;
+ PutFixed64(&full_history_ts_low, std::numeric_limits<uint64_t>::max());
+ {
+ // With a snapshot at seq 3, both the deletion marker and the key at seq 3
+ // must be preserved.
+ AddSnapshot(3);
+ const std::vector<std::string> expected_keys = {
+ input_keys[0], input_keys[1], input_keys[2]};
+ const std::vector<std::string> expected_values = {"", "a2", "b5"};
+ RunTest(input_keys, input_values, expected_keys, expected_values,
+ /*last_committed_seq=*/kMaxSequenceNumber,
+ /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+ /*bottommost_level=*/false,
+ /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+ /*key_not_exists_beyond_output_level=*/false, &full_history_ts_low);
+ ClearSnapshots();
+ }
+ {
+ // No snapshot.
+ const std::vector<std::string> expected_keys = {input_keys[2]};
+ const std::vector<std::string> expected_values = {"b5"};
+ RunTest(input_keys, input_values, expected_keys, expected_values,
+ /*last_committed_seq=*/kMaxSequenceNumber,
+ /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+ /*bottommost_level=*/false,
+ /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+ /*key_not_exists_beyond_output_level=*/false, &full_history_ts_low);
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(CompactionIteratorTsGcTestInstance,
+ CompactionIteratorTsGcTest,
+ testing::Values(true, false));
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/compaction/compaction_job.cc b/src/rocksdb/db/compaction/compaction_job.cc
new file mode 100644
index 000000000..1da1bcda8
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_job.cc
@@ -0,0 +1,2060 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction_job.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <memory>
+#include <optional>
+#include <set>
+#include <utility>
+#include <vector>
+
+#include "db/blob/blob_counting_iterator.h"
+#include "db/blob/blob_file_addition.h"
+#include "db/blob/blob_file_builder.h"
+#include "db/builder.h"
+#include "db/compaction/clipping_iterator.h"
+#include "db/compaction/compaction_state.h"
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "db/error_handler.h"
+#include "db/event_helpers.h"
+#include "db/history_trimming_iterator.h"
+#include "db/log_writer.h"
+#include "db/merge_helper.h"
+#include "db/range_del_aggregator.h"
+#include "db/version_edit.h"
+#include "db/version_set.h"
+#include "file/filename.h"
+#include "file/read_write_util.h"
+#include "file/sst_file_manager_impl.h"
+#include "file/writable_file_writer.h"
+#include "logging/log_buffer.h"
+#include "logging/logging.h"
+#include "monitoring/iostats_context_imp.h"
+#include "monitoring/thread_status_util.h"
+#include "options/configurable_helper.h"
+#include "options/options_helper.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "rocksdb/utilities/options_type.h"
+#include "table/merging_iterator.h"
+#include "table/table_builder.h"
+#include "table/unique_id_impl.h"
+#include "test_util/sync_point.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const char* GetCompactionReasonString(CompactionReason compaction_reason) {
+ switch (compaction_reason) {
+ case CompactionReason::kUnknown:
+ return "Unknown";
+ case CompactionReason::kLevelL0FilesNum:
+ return "LevelL0FilesNum";
+ case CompactionReason::kLevelMaxLevelSize:
+ return "LevelMaxLevelSize";
+ case CompactionReason::kUniversalSizeAmplification:
+ return "UniversalSizeAmplification";
+ case CompactionReason::kUniversalSizeRatio:
+ return "UniversalSizeRatio";
+ case CompactionReason::kUniversalSortedRunNum:
+ return "UniversalSortedRunNum";
+ case CompactionReason::kFIFOMaxSize:
+ return "FIFOMaxSize";
+ case CompactionReason::kFIFOReduceNumFiles:
+ return "FIFOReduceNumFiles";
+ case CompactionReason::kFIFOTtl:
+ return "FIFOTtl";
+ case CompactionReason::kManualCompaction:
+ return "ManualCompaction";
+ case CompactionReason::kFilesMarkedForCompaction:
+ return "FilesMarkedForCompaction";
+ case CompactionReason::kBottommostFiles:
+ return "BottommostFiles";
+ case CompactionReason::kTtl:
+ return "Ttl";
+ case CompactionReason::kFlush:
+ return "Flush";
+ case CompactionReason::kExternalSstIngestion:
+ return "ExternalSstIngestion";
+ case CompactionReason::kPeriodicCompaction:
+ return "PeriodicCompaction";
+ case CompactionReason::kChangeTemperature:
+ return "ChangeTemperature";
+ case CompactionReason::kForcedBlobGC:
+ return "ForcedBlobGC";
+ case CompactionReason::kRoundRobinTtl:
+ return "RoundRobinTtl";
+ case CompactionReason::kNumOfReasons:
+ // fall through
+ default:
+ assert(false);
+ return "Invalid";
+ }
+}
+
+const char* GetCompactionPenultimateOutputRangeTypeString(
+ Compaction::PenultimateOutputRangeType range_type) {
+ switch (range_type) {
+ case Compaction::PenultimateOutputRangeType::kNotSupported:
+ return "NotSupported";
+ case Compaction::PenultimateOutputRangeType::kFullRange:
+ return "FullRange";
+ case Compaction::PenultimateOutputRangeType::kNonLastRange:
+ return "NonLastRange";
+ case Compaction::PenultimateOutputRangeType::kDisabled:
+ return "Disabled";
+ default:
+ assert(false);
+ return "Invalid";
+ }
+}
+
+CompactionJob::CompactionJob(
+ int job_id, Compaction* compaction, const ImmutableDBOptions& db_options,
+ const MutableDBOptions& mutable_db_options, const FileOptions& file_options,
+ VersionSet* versions, const std::atomic<bool>* shutting_down,
+ LogBuffer* log_buffer, FSDirectory* db_directory,
+ FSDirectory* output_directory, FSDirectory* blob_output_directory,
+ Statistics* stats, InstrumentedMutex* db_mutex,
+ ErrorHandler* db_error_handler,
+ std::vector<SequenceNumber> existing_snapshots,
+ SequenceNumber earliest_write_conflict_snapshot,
+ const SnapshotChecker* snapshot_checker, JobContext* job_context,
+ std::shared_ptr<Cache> table_cache, EventLogger* event_logger,
+ bool paranoid_file_checks, bool measure_io_stats, const std::string& dbname,
+ CompactionJobStats* compaction_job_stats, Env::Priority thread_pri,
+ const std::shared_ptr<IOTracer>& io_tracer,
+ const std::atomic<bool>& manual_compaction_canceled,
+ const std::string& db_id, const std::string& db_session_id,
+ std::string full_history_ts_low, std::string trim_ts,
+ BlobFileCompletionCallback* blob_callback, int* bg_compaction_scheduled,
+ int* bg_bottom_compaction_scheduled)
+ : compact_(new CompactionState(compaction)),
+ compaction_stats_(compaction->compaction_reason(), 1),
+ db_options_(db_options),
+ mutable_db_options_copy_(mutable_db_options),
+ log_buffer_(log_buffer),
+ output_directory_(output_directory),
+ stats_(stats),
+ bottommost_level_(false),
+ write_hint_(Env::WLTH_NOT_SET),
+ compaction_job_stats_(compaction_job_stats),
+ job_id_(job_id),
+ dbname_(dbname),
+ db_id_(db_id),
+ db_session_id_(db_session_id),
+ file_options_(file_options),
+ env_(db_options.env),
+ io_tracer_(io_tracer),
+ fs_(db_options.fs, io_tracer),
+ file_options_for_read_(
+ fs_->OptimizeForCompactionTableRead(file_options, db_options_)),
+ versions_(versions),
+ shutting_down_(shutting_down),
+ manual_compaction_canceled_(manual_compaction_canceled),
+ db_directory_(db_directory),
+ blob_output_directory_(blob_output_directory),
+ db_mutex_(db_mutex),
+ db_error_handler_(db_error_handler),
+ existing_snapshots_(std::move(existing_snapshots)),
+ earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot),
+ snapshot_checker_(snapshot_checker),
+ job_context_(job_context),
+ table_cache_(std::move(table_cache)),
+ event_logger_(event_logger),
+ paranoid_file_checks_(paranoid_file_checks),
+ measure_io_stats_(measure_io_stats),
+ thread_pri_(thread_pri),
+ full_history_ts_low_(std::move(full_history_ts_low)),
+ trim_ts_(std::move(trim_ts)),
+ blob_callback_(blob_callback),
+ extra_num_subcompaction_threads_reserved_(0),
+ bg_compaction_scheduled_(bg_compaction_scheduled),
+ bg_bottom_compaction_scheduled_(bg_bottom_compaction_scheduled) {
+ assert(compaction_job_stats_ != nullptr);
+ assert(log_buffer_ != nullptr);
+
+ const auto* cfd = compact_->compaction->column_family_data();
+ ThreadStatusUtil::SetColumnFamily(cfd, cfd->ioptions()->env,
+ db_options_.enable_thread_tracking);
+ ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION);
+ ReportStartedCompaction(compaction);
+}
+
+CompactionJob::~CompactionJob() {
+ assert(compact_ == nullptr);
+ ThreadStatusUtil::ResetThreadStatus();
+}
+
+void CompactionJob::ReportStartedCompaction(Compaction* compaction) {
+ const auto* cfd = compact_->compaction->column_family_data();
+ ThreadStatusUtil::SetColumnFamily(cfd, cfd->ioptions()->env,
+ db_options_.enable_thread_tracking);
+
+ ThreadStatusUtil::SetThreadOperationProperty(ThreadStatus::COMPACTION_JOB_ID,
+ job_id_);
+
+ ThreadStatusUtil::SetThreadOperationProperty(
+ ThreadStatus::COMPACTION_INPUT_OUTPUT_LEVEL,
+ (static_cast<uint64_t>(compact_->compaction->start_level()) << 32) +
+ compact_->compaction->output_level());
+
+ // In the current design, a CompactionJob is always created
+ // for non-trivial compaction.
+ assert(compaction->IsTrivialMove() == false ||
+ compaction->is_manual_compaction() == true);
+
+ ThreadStatusUtil::SetThreadOperationProperty(
+ ThreadStatus::COMPACTION_PROP_FLAGS,
+ compaction->is_manual_compaction() +
+ (compaction->deletion_compaction() << 1));
+
+ ThreadStatusUtil::SetThreadOperationProperty(
+ ThreadStatus::COMPACTION_TOTAL_INPUT_BYTES,
+ compaction->CalculateTotalInputSize());
+
+ IOSTATS_RESET(bytes_written);
+ IOSTATS_RESET(bytes_read);
+ ThreadStatusUtil::SetThreadOperationProperty(
+ ThreadStatus::COMPACTION_BYTES_WRITTEN, 0);
+ ThreadStatusUtil::SetThreadOperationProperty(
+ ThreadStatus::COMPACTION_BYTES_READ, 0);
+
+ // Set the thread operation after operation properties
+ // to ensure GetThreadList() can always show them all together.
+ ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION);
+
+ compaction_job_stats_->is_manual_compaction =
+ compaction->is_manual_compaction();
+ compaction_job_stats_->is_full_compaction = compaction->is_full_compaction();
+}
+
+void CompactionJob::Prepare() {
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_COMPACTION_PREPARE);
+
+ // Generate file_levels_ for compaction before making Iterator
+ auto* c = compact_->compaction;
+ ColumnFamilyData* cfd = c->column_family_data();
+ assert(cfd != nullptr);
+ assert(cfd->current()->storage_info()->NumLevelFiles(
+ compact_->compaction->level()) > 0);
+
+ write_hint_ = cfd->CalculateSSTWriteHint(c->output_level());
+ bottommost_level_ = c->bottommost_level();
+
+ if (c->ShouldFormSubcompactions()) {
+ StopWatch sw(db_options_.clock, stats_, SUBCOMPACTION_SETUP_TIME);
+ GenSubcompactionBoundaries();
+ }
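+ // For illustration (hypothetical boundary keys, not taken from this code):
+ // with boundaries_ = {"k1", "k2"}, the branch below creates three
+ // subcompaction states covering (-inf, "k1"), ["k1", "k2") and ["k2", +inf),
+ // so every key of the compaction input falls into exactly one subcompaction.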
+ if (boundaries_.size() > 1) {
+ for (size_t i = 0; i <= boundaries_.size(); i++) {
+ compact_->sub_compact_states.emplace_back(
+ c, (i != 0) ? std::optional<Slice>(boundaries_[i - 1]) : std::nullopt,
+ (i != boundaries_.size()) ? std::optional<Slice>(boundaries_[i])
+ : std::nullopt,
+ static_cast<uint32_t>(i));
+ // Assert to validate that the boundaries don't have the same user keys
+ // (without the timestamp part).
+ assert(i == 0 || i == boundaries_.size() ||
+ cfd->user_comparator()->CompareWithoutTimestamp(
+ boundaries_[i - 1], boundaries_[i]) < 0);
+ }
+ RecordInHistogram(stats_, NUM_SUBCOMPACTIONS_SCHEDULED,
+ compact_->sub_compact_states.size());
+ } else {
+ compact_->sub_compact_states.emplace_back(c, std::nullopt, std::nullopt,
+ /*sub_job_id*/ 0);
+ }
+
+ // Collect all seqno->time information from the input files, which will be
+ // used to encode the seqno->time mapping in the output files.
+ uint64_t preserve_time_duration =
+ std::max(c->immutable_options()->preserve_internal_time_seconds,
+ c->immutable_options()->preclude_last_level_data_seconds);
+
+ if (preserve_time_duration > 0) {
+ // setup seqno_time_mapping_
+ seqno_time_mapping_.SetMaxTimeDuration(preserve_time_duration);
+ for (const auto& each_level : *c->inputs()) {
+ for (const auto& fmd : each_level.files) {
+ std::shared_ptr<const TableProperties> tp;
+ Status s = cfd->current()->GetTableProperties(&tp, fmd, nullptr);
+ if (s.ok()) {
+ seqno_time_mapping_.Add(tp->seqno_to_time_mapping)
+ .PermitUncheckedError();
+ seqno_time_mapping_.Add(fmd->fd.smallest_seqno,
+ fmd->oldest_ancester_time);
+ }
+ }
+ }
+
+ auto status = seqno_time_mapping_.Sort();
+ if (!status.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Invalid sequence number to time mapping: Status: %s",
+ status.ToString().c_str());
+ }
+ int64_t _current_time = 0;
+ status = db_options_.clock->GetCurrentTime(&_current_time);
+ if (!status.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Failed to get current time in compaction: Status: %s",
+ status.ToString().c_str());
+ // preserve all time information
+ preserve_time_min_seqno_ = 0;
+ preclude_last_level_min_seqno_ = 0;
+ } else {
+ seqno_time_mapping_.TruncateOldEntries(_current_time);
+ uint64_t preserve_time =
+ static_cast<uint64_t>(_current_time) > preserve_time_duration
+ ? _current_time - preserve_time_duration
+ : 0;
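+ // For example (hypothetical numbers): with _current_time = 10000 and
+ // preserve_time_duration = 3000, preserve_time is 7000, and
+ // preserve_time_min_seqno_ below becomes, roughly, the oldest sequence
+ // number still considered within the preserve window.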
+ preserve_time_min_seqno_ =
+ seqno_time_mapping_.GetOldestSequenceNum(preserve_time);
+ if (c->immutable_options()->preclude_last_level_data_seconds > 0) {
+ uint64_t preclude_last_level_time =
+ static_cast<uint64_t>(_current_time) >
+ c->immutable_options()->preclude_last_level_data_seconds
+ ? _current_time -
+ c->immutable_options()->preclude_last_level_data_seconds
+ : 0;
+ preclude_last_level_min_seqno_ =
+ seqno_time_mapping_.GetOldestSequenceNum(preclude_last_level_time);
+ }
+ }
+ }
+}
+
+uint64_t CompactionJob::GetSubcompactionsLimit() {
+ return extra_num_subcompaction_threads_reserved_ +
+ std::max(
+ std::uint64_t(1),
+ static_cast<uint64_t>(compact_->compaction->max_subcompactions()));
+}
+
+void CompactionJob::AcquireSubcompactionResources(
+ int num_extra_required_subcompactions) {
+ TEST_SYNC_POINT("CompactionJob::AcquireSubcompactionResources:0");
+ TEST_SYNC_POINT("CompactionJob::AcquireSubcompactionResources:1");
+ int max_db_compactions =
+ DBImpl::GetBGJobLimits(
+ mutable_db_options_copy_.max_background_flushes,
+ mutable_db_options_copy_.max_background_compactions,
+ mutable_db_options_copy_.max_background_jobs,
+ versions_->GetColumnFamilySet()
+ ->write_controller()
+ ->NeedSpeedupCompaction())
+ .max_compactions;
+ InstrumentedMutexLock l(db_mutex_);
+ // Apply the min function first, since we need to compute the number of extra
+ // subcompactions against the compaction limits, and then try to reserve
+ // threads for the extra subcompactions. The actual number of reserved
+ // threads could be less than the desired number.
+ int available_bg_compactions_against_db_limit =
+ std::max(max_db_compactions - *bg_compaction_scheduled_ -
+ *bg_bottom_compaction_scheduled_,
+ 0);
+ // Reservation only supports background threads whose priority is between
+ // BOTTOM and HIGH. The priority needs to be degraded to HIGH if the original
+ // thread_pri_ is higher than that. Similar to ReleaseThreads().
+ extra_num_subcompaction_threads_reserved_ =
+ env_->ReserveThreads(std::min(num_extra_required_subcompactions,
+ available_bg_compactions_against_db_limit),
+ std::min(thread_pri_, Env::Priority::HIGH));
+
+ // Update bg_compaction_scheduled_ or bg_bottom_compaction_scheduled_
+ // depending on whether this compaction has the bottommost priority
+ if (thread_pri_ == Env::Priority::BOTTOM) {
+ *bg_bottom_compaction_scheduled_ +=
+ extra_num_subcompaction_threads_reserved_;
+ } else {
+ *bg_compaction_scheduled_ += extra_num_subcompaction_threads_reserved_;
+ }
+}
+
+void CompactionJob::ShrinkSubcompactionResources(uint64_t num_extra_resources) {
+ // Do nothing when we have zero resources to shrink
+ if (num_extra_resources == 0) return;
+ db_mutex_->Lock();
+ // We cannot release more threads than we reserved before
+ int extra_num_subcompaction_threads_released = env_->ReleaseThreads(
+ (int)num_extra_resources, std::min(thread_pri_, Env::Priority::HIGH));
+ // Update the number of reserved threads and the number of background
+ // scheduled compactions for this compaction job
+ extra_num_subcompaction_threads_reserved_ -=
+ extra_num_subcompaction_threads_released;
+ // TODO (zichen): design a test case with new subcompaction partitioning
+ // when the number of actual partitions is less than the number of planned
+ // partitions
+ assert(extra_num_subcompaction_threads_released == (int)num_extra_resources);
+ // Update bg_compaction_scheduled_ or bg_bottom_compaction_scheduled_
+ // depending on whether this compaction has the bottommost priority
+ if (thread_pri_ == Env::Priority::BOTTOM) {
+ *bg_bottom_compaction_scheduled_ -=
+ extra_num_subcompaction_threads_released;
+ } else {
+ *bg_compaction_scheduled_ -= extra_num_subcompaction_threads_released;
+ }
+ db_mutex_->Unlock();
+ TEST_SYNC_POINT("CompactionJob::ShrinkSubcompactionResources:0");
+}
+
+void CompactionJob::ReleaseSubcompactionResources() {
+ if (extra_num_subcompaction_threads_reserved_ == 0) {
+ return;
+ }
+ {
+ InstrumentedMutexLock l(db_mutex_);
+ // The number of reserved threads becomes larger than 0 only if the
+ // compaction priority is round-robin and there are not sufficient
+ // subcompactions available.
+
+ // The number of scheduled compactions must be no less than 1 + the number of
+ // extra subcompactions using acquired resources, since this compaction job
+ // has not finished yet.
+ assert(*bg_bottom_compaction_scheduled_ >=
+ 1 + extra_num_subcompaction_threads_reserved_ ||
+ *bg_compaction_scheduled_ >=
+ 1 + extra_num_subcompaction_threads_reserved_);
+ }
+ ShrinkSubcompactionResources(extra_num_subcompaction_threads_reserved_);
+}
+
+struct RangeWithSize {
+ Range range;
+ uint64_t size;
+
+ RangeWithSize(const Slice& a, const Slice& b, uint64_t s = 0)
+ : range(a, b), size(s) {}
+};
+
+void CompactionJob::GenSubcompactionBoundaries() {
+ // The goal is to find some boundary keys so that we can evenly partition
+ // the compaction input data into max_subcompactions ranges.
+ // For every input file, we ask the TableReader to estimate 128 anchor points
+ // that evenly partition the input file into 128 ranges, along with the sizes
+ // of those ranges. This can be calculated by scanning the index blocks of
+ // the file. Once we have the anchor points for all the input files, we merge
+ // them together and try to find keys dividing the ranges evenly.
+ // For example, if we have two input files, and each returns the following
+ // ranges:
+ // File1: (a1, 1000), (b1, 1200), (c1, 1100)
+ // File2: (a2, 1100), (b2, 1000), (c2, 1000)
+ // We sort all the keys into the following order:
+ // (a1, 1000), (a2, 1100), (b1, 1200), (b2, 1000), (c1, 1100), (c2, 1000)
+ // We calculate the total size by adding up all the ranges' sizes, which is
+ // 6400. If we would like to partition into 2 subcompactions, the target
+ // range size is 3200. Based on the size, we take "b1" as the partition key
+ // since the first three ranges would hit 3200.
+ //
+ // Note that the ranges are actually overlapping. For example, in the example
+ // above, the range ending with "b1" overlaps with the range ending with
+ // "b2", so the size 1000+1100+1200 is an underestimation of the data size up
+ // to "b1". In extreme cases where we only compact N L0 files, a range can
+ // overlap with N-1 other ranges. Since we requested a relatively large
+ // number (128) of ranges from each input file, even N overlapping ranges
+ // would cause relatively small inaccuracy.
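+ //
+ // Working through the example above: the cumulative sizes in sorted order
+ // are a1: 1000, a2: 2100, b1: 3300, b2: 4300, c1: 5400, c2: 6400. With a
+ // target range size of 6400 / 2 = 3200, "b1" is the first anchor whose
+ // cumulative size exceeds the threshold, so it becomes the single partition
+ // boundary, giving the two subcompaction ranges (-inf, "b1") and ["b1", +inf).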
+
+ auto* c = compact_->compaction;
+ if (c->max_subcompactions() <= 1 &&
+ !(c->immutable_options()->compaction_pri == kRoundRobin &&
+ c->immutable_options()->compaction_style == kCompactionStyleLevel)) {
+ return;
+ }
+ auto* cfd = c->column_family_data();
+ const Comparator* cfd_comparator = cfd->user_comparator();
+ const InternalKeyComparator& icomp = cfd->internal_comparator();
+
+ auto* v = compact_->compaction->input_version();
+ int base_level = v->storage_info()->base_level();
+ InstrumentedMutexUnlock unlock_guard(db_mutex_);
+
+ uint64_t total_size = 0;
+ std::vector<TableReader::Anchor> all_anchors;
+ int start_lvl = c->start_level();
+ int out_lvl = c->output_level();
+
+ for (size_t lvl_idx = 0; lvl_idx < c->num_input_levels(); lvl_idx++) {
+ int lvl = c->level(lvl_idx);
+ if (lvl >= start_lvl && lvl <= out_lvl) {
+ const LevelFilesBrief* flevel = c->input_levels(lvl_idx);
+ size_t num_files = flevel->num_files;
+
+ if (num_files == 0) {
+ continue;
+ }
+
+ for (size_t i = 0; i < num_files; i++) {
+ FileMetaData* f = flevel->files[i].file_metadata;
+ std::vector<TableReader::Anchor> my_anchors;
+ Status s = cfd->table_cache()->ApproximateKeyAnchors(
+ ReadOptions(), icomp, *f, my_anchors);
+ if (!s.ok() || my_anchors.empty()) {
+ my_anchors.emplace_back(f->largest.user_key(), f->fd.GetFileSize());
+ }
+ for (auto& ac : my_anchors) {
+ // Can be optimized to avoid this loop.
+ total_size += ac.range_size;
+ }
+
+ all_anchors.insert(all_anchors.end(), my_anchors.begin(),
+ my_anchors.end());
+ }
+ }
+ }
+ // Here we sort all the anchor points across all files and go through them
+ // in sorted order to find partitioning boundaries.
+ // This is not the most efficient implementation; a much more efficient
+ // algorithm probably exists, but it would be more complex. If performance
+ // turns out to be a problem, we can optimize.
+ std::sort(
+ all_anchors.begin(), all_anchors.end(),
+ [cfd_comparator](TableReader::Anchor& a, TableReader::Anchor& b) -> bool {
+ return cfd_comparator->CompareWithoutTimestamp(a.user_key, b.user_key) <
+ 0;
+ });
+
+ // Remove duplicated entries from boundaries.
+ all_anchors.erase(
+ std::unique(all_anchors.begin(), all_anchors.end(),
+ [cfd_comparator](TableReader::Anchor& a,
+ TableReader::Anchor& b) -> bool {
+ return cfd_comparator->CompareWithoutTimestamp(
+ a.user_key, b.user_key) == 0;
+ }),
+ all_anchors.end());
+
+ // Get the number of planned subcompactions; this may reserve threads and
+ // update extra_num_subcompaction_threads_reserved_ for round-robin priority
+ uint64_t num_planned_subcompactions;
+ if (c->immutable_options()->compaction_pri == kRoundRobin &&
+ c->immutable_options()->compaction_style == kCompactionStyleLevel) {
+ // For round-robin compaction priority, we need to employ more
+ // subcompactions (which may exceed the max_subcompactions limit). The extra
+ // subcompactions will be executed using reserved threads and are taken into
+ // account in bg_compaction_scheduled or bg_bottom_compaction_scheduled.
+
+ // Initialized to the number of input files
+ num_planned_subcompactions = static_cast<uint64_t>(c->num_input_files(0));
+ uint64_t max_subcompactions_limit = GetSubcompactionsLimit();
+ if (max_subcompactions_limit < num_planned_subcompactions) {
+ // Assert that the two pointers are not null so that we can use extra
+ // subcompactions against the DB compaction limits
+ assert(bg_bottom_compaction_scheduled_ != nullptr);
+ assert(bg_compaction_scheduled_ != nullptr);
+ // Reserve resources when max_subcompactions is not sufficient
+ AcquireSubcompactionResources(
+ (int)(num_planned_subcompactions - max_subcompactions_limit));
+ // Subcompactions limit changes after acquiring additional resources.
+ // Need to call GetSubcompactionsLimit() again to update the number
+ // of planned subcompactions
+ num_planned_subcompactions =
+ std::min(num_planned_subcompactions, GetSubcompactionsLimit());
+ } else {
+ num_planned_subcompactions = max_subcompactions_limit;
+ }
+ } else {
+ num_planned_subcompactions = GetSubcompactionsLimit();
+ }
+
+ TEST_SYNC_POINT_CALLBACK("CompactionJob::GenSubcompactionBoundaries:0",
+ &num_planned_subcompactions);
+ if (num_planned_subcompactions == 1) return;
+
+ // Group the ranges into subcompactions
+ uint64_t target_range_size = std::max(
+ total_size / num_planned_subcompactions,
+ MaxFileSizeForLevel(
+ *(c->mutable_cf_options()), out_lvl,
+ c->immutable_options()->compaction_style, base_level,
+ c->immutable_options()->level_compaction_dynamic_level_bytes));
+
+ if (target_range_size >= total_size) {
+ return;
+ }
+
+ uint64_t next_threshold = target_range_size;
+ uint64_t cumulative_size = 0;
+ uint64_t num_actual_subcompactions = 1U;
+ for (TableReader::Anchor& anchor : all_anchors) {
+ cumulative_size += anchor.range_size;
+ if (cumulative_size > next_threshold) {
+ next_threshold += target_range_size;
+ num_actual_subcompactions++;
+ boundaries_.push_back(anchor.user_key);
+ }
+ if (num_actual_subcompactions == num_planned_subcompactions) {
+ break;
+ }
+ }
+ TEST_SYNC_POINT_CALLBACK("CompactionJob::GenSubcompactionBoundaries:1",
+ &num_actual_subcompactions);
+ // Shrink extra subcompaction resources when extra resources were acquired
+ ShrinkSubcompactionResources(
+ std::min((int)(num_planned_subcompactions - num_actual_subcompactions),
+ extra_num_subcompaction_threads_reserved_));
+}
+
+Status CompactionJob::Run() {
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_COMPACTION_RUN);
+ TEST_SYNC_POINT("CompactionJob::Run():Start");
+ log_buffer_->FlushBufferToLog();
+ LogCompaction();
+
+ const size_t num_threads = compact_->sub_compact_states.size();
+ assert(num_threads > 0);
+ const uint64_t start_micros = db_options_.clock->NowMicros();
+
+ // Launch a thread for each of subcompactions 1...num_threads-1
+ std::vector<port::Thread> thread_pool;
+ thread_pool.reserve(num_threads - 1);
+ for (size_t i = 1; i < compact_->sub_compact_states.size(); i++) {
+ thread_pool.emplace_back(&CompactionJob::ProcessKeyValueCompaction, this,
+ &compact_->sub_compact_states[i]);
+ }
+
+ // Always schedule the first subcompaction (whether or not there are also
+ // others) in the current thread to be efficient with resources
+ ProcessKeyValueCompaction(&compact_->sub_compact_states[0]);
+
+ // Wait for all other threads (if there are any) to finish execution
+ for (auto& thread : thread_pool) {
+ thread.join();
+ }
+
+ compaction_stats_.SetMicros(db_options_.clock->NowMicros() - start_micros);
+
+ for (auto& state : compact_->sub_compact_states) {
+ compaction_stats_.AddCpuMicros(state.compaction_job_stats.cpu_micros);
+ state.RemoveLastEmptyOutput();
+ }
+
+ RecordTimeToHistogram(stats_, COMPACTION_TIME,
+ compaction_stats_.stats.micros);
+ RecordTimeToHistogram(stats_, COMPACTION_CPU_TIME,
+ compaction_stats_.stats.cpu_micros);
+
+ TEST_SYNC_POINT("CompactionJob::Run:BeforeVerify");
+
+ // Check if any thread encountered an error during execution
+ Status status;
+ IOStatus io_s;
+ bool wrote_new_blob_files = false;
+
+ for (const auto& state : compact_->sub_compact_states) {
+ if (!state.status.ok()) {
+ status = state.status;
+ io_s = state.io_status;
+ break;
+ }
+
+ if (state.Current().HasBlobFileAdditions()) {
+ wrote_new_blob_files = true;
+ }
+ }
+
+ if (io_status_.ok()) {
+ io_status_ = io_s;
+ }
+ if (status.ok()) {
+ constexpr IODebugContext* dbg = nullptr;
+
+ if (output_directory_) {
+ io_s = output_directory_->FsyncWithDirOptions(
+ IOOptions(), dbg,
+ DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
+ }
+
+ if (io_s.ok() && wrote_new_blob_files && blob_output_directory_ &&
+ blob_output_directory_ != output_directory_) {
+ io_s = blob_output_directory_->FsyncWithDirOptions(
+ IOOptions(), dbg,
+ DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
+ }
+ }
+ if (io_status_.ok()) {
+ io_status_ = io_s;
+ }
+ if (status.ok()) {
+ status = io_s;
+ }
+ if (status.ok()) {
+ thread_pool.clear();
+ std::vector<const CompactionOutputs::Output*> files_output;
+ for (const auto& state : compact_->sub_compact_states) {
+ for (const auto& output : state.GetOutputs()) {
+ files_output.emplace_back(&output);
+ }
+ }
+ ColumnFamilyData* cfd = compact_->compaction->column_family_data();
+ auto& prefix_extractor =
+ compact_->compaction->mutable_cf_options()->prefix_extractor;
+ std::atomic<size_t> next_file_idx(0);
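+ // Each verification worker claims the next unverified output file by
+ // atomically incrementing next_file_idx, so the output files are dynamically
+ // load-balanced across the verification threads.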
+ auto verify_table = [&](Status& output_status) {
+ while (true) {
+ size_t file_idx = next_file_idx.fetch_add(1);
+ if (file_idx >= files_output.size()) {
+ break;
+ }
+ // Verify that the table is usable.
+ // We set for_compaction to false and don't call
+ // OptimizeForCompactionTableRead here because this is a special case
+ // after we finish building the table. No matter whether
+ // use_direct_io_for_flush_and_compaction is true, we will regard this
+ // verification as user reads, since the goal is to cache the table here
+ // for further user reads.
+ ReadOptions read_options;
+ InternalIterator* iter = cfd->table_cache()->NewIterator(
+ read_options, file_options_, cfd->internal_comparator(),
+ files_output[file_idx]->meta, /*range_del_agg=*/nullptr,
+ prefix_extractor,
+ /*table_reader_ptr=*/nullptr,
+ cfd->internal_stats()->GetFileReadHist(
+ compact_->compaction->output_level()),
+ TableReaderCaller::kCompactionRefill, /*arena=*/nullptr,
+ /*skip_filters=*/false, compact_->compaction->output_level(),
+ MaxFileSizeForL0MetaPin(
+ *compact_->compaction->mutable_cf_options()),
+ /*smallest_compaction_key=*/nullptr,
+ /*largest_compaction_key=*/nullptr,
+ /*allow_unprepared_value=*/false);
+ auto s = iter->status();
+
+ if (s.ok() && paranoid_file_checks_) {
+ OutputValidator validator(cfd->internal_comparator(),
+ /*_enable_order_check=*/true,
+ /*_enable_hash=*/true);
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ s = validator.Add(iter->key(), iter->value());
+ if (!s.ok()) {
+ break;
+ }
+ }
+ if (s.ok()) {
+ s = iter->status();
+ }
+ if (s.ok() &&
+ !validator.CompareValidator(files_output[file_idx]->validator)) {
+ s = Status::Corruption("Paranoid checksums do not match");
+ }
+ }
+
+ delete iter;
+
+ if (!s.ok()) {
+ output_status = s;
+ break;
+ }
+ }
+ };
+ for (size_t i = 1; i < compact_->sub_compact_states.size(); i++) {
+ thread_pool.emplace_back(
+ verify_table, std::ref(compact_->sub_compact_states[i].status));
+ }
+ verify_table(compact_->sub_compact_states[0].status);
+ for (auto& thread : thread_pool) {
+ thread.join();
+ }
+
+ for (const auto& state : compact_->sub_compact_states) {
+ if (!state.status.ok()) {
+ status = state.status;
+ break;
+ }
+ }
+ }
+
+ ReleaseSubcompactionResources();
+ TEST_SYNC_POINT("CompactionJob::ReleaseSubcompactionResources:0");
+ TEST_SYNC_POINT("CompactionJob::ReleaseSubcompactionResources:1");
+
+ TablePropertiesCollection tp;
+ for (const auto& state : compact_->sub_compact_states) {
+ for (const auto& output : state.GetOutputs()) {
+ auto fn =
+ TableFileName(state.compaction->immutable_options()->cf_paths,
+ output.meta.fd.GetNumber(), output.meta.fd.GetPathId());
+ tp[fn] = output.table_properties;
+ }
+ }
+ compact_->compaction->SetOutputTableProperties(std::move(tp));
+
+ // Finish up all book-keeping to unify the subcompaction results
+ compact_->AggregateCompactionStats(compaction_stats_, *compaction_job_stats_);
+ UpdateCompactionStats();
+
+ RecordCompactionIOStats();
+ LogFlush(db_options_.info_log);
+ TEST_SYNC_POINT("CompactionJob::Run():End");
+
+ compact_->status = status;
+ return status;
+}
+
+Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) {
+ assert(compact_);
+
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_COMPACTION_INSTALL);
+ db_mutex_->AssertHeld();
+ Status status = compact_->status;
+
+ ColumnFamilyData* cfd = compact_->compaction->column_family_data();
+ assert(cfd);
+
+ int output_level = compact_->compaction->output_level();
+ cfd->internal_stats()->AddCompactionStats(output_level, thread_pri_,
+ compaction_stats_);
+
+ if (status.ok()) {
+ status = InstallCompactionResults(mutable_cf_options);
+ }
+ if (!versions_->io_status().ok()) {
+ io_status_ = versions_->io_status();
+ }
+
+ VersionStorageInfo::LevelSummaryStorage tmp;
+ auto vstorage = cfd->current()->storage_info();
+ const auto& stats = compaction_stats_.stats;
+
+ double read_write_amp = 0.0;
+ double write_amp = 0.0;
+ double bytes_read_per_sec = 0;
+ double bytes_written_per_sec = 0;
+
+ const uint64_t bytes_read_non_output_and_blob =
+ stats.bytes_read_non_output_levels + stats.bytes_read_blob;
+ const uint64_t bytes_read_all =
+ stats.bytes_read_output_level + bytes_read_non_output_and_blob;
+ const uint64_t bytes_written_all =
+ stats.bytes_written + stats.bytes_written_blob;
+
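+  // Both amplification figures below are measured against the bytes read
+  // from the non-output input levels plus blobs: read-write-amplify is
+  // (bytes read + bytes written) / input bytes, and write-amplify is
+  // bytes written / input bytes.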
+ if (bytes_read_non_output_and_blob > 0) {
+ read_write_amp = (bytes_written_all + bytes_read_all) /
+ static_cast<double>(bytes_read_non_output_and_blob);
+ write_amp =
+ bytes_written_all / static_cast<double>(bytes_read_non_output_and_blob);
+ }
+ if (stats.micros > 0) {
+ bytes_read_per_sec = bytes_read_all / static_cast<double>(stats.micros);
+ bytes_written_per_sec =
+ bytes_written_all / static_cast<double>(stats.micros);
+ }
+
+ const std::string& column_family_name = cfd->GetName();
+
+ constexpr double kMB = 1048576.0;
+
+ ROCKS_LOG_BUFFER(
+ log_buffer_,
+ "[%s] compacted to: %s, MB/sec: %.1f rd, %.1f wr, level %d, "
+ "files in(%d, %d) out(%d +%d blob) "
+ "MB in(%.1f, %.1f +%.1f blob) out(%.1f +%.1f blob), "
+ "read-write-amplify(%.1f) write-amplify(%.1f) %s, records in: %" PRIu64
+ ", records dropped: %" PRIu64 " output_compression: %s\n",
+ column_family_name.c_str(), vstorage->LevelSummary(&tmp),
+ bytes_read_per_sec, bytes_written_per_sec,
+ compact_->compaction->output_level(),
+ stats.num_input_files_in_non_output_levels,
+ stats.num_input_files_in_output_level, stats.num_output_files,
+ stats.num_output_files_blob, stats.bytes_read_non_output_levels / kMB,
+ stats.bytes_read_output_level / kMB, stats.bytes_read_blob / kMB,
+ stats.bytes_written / kMB, stats.bytes_written_blob / kMB, read_write_amp,
+ write_amp, status.ToString().c_str(), stats.num_input_records,
+ stats.num_dropped_records,
+ CompressionTypeToString(compact_->compaction->output_compression())
+ .c_str());
+
+ const auto& blob_files = vstorage->GetBlobFiles();
+ if (!blob_files.empty()) {
+ assert(blob_files.front());
+ assert(blob_files.back());
+
+ ROCKS_LOG_BUFFER(
+ log_buffer_,
+ "[%s] Blob file summary: head=%" PRIu64 ", tail=%" PRIu64 "\n",
+ column_family_name.c_str(), blob_files.front()->GetBlobFileNumber(),
+ blob_files.back()->GetBlobFileNumber());
+ }
+
+ if (compaction_stats_.has_penultimate_level_output) {
+ ROCKS_LOG_BUFFER(
+ log_buffer_,
+ "[%s] has Penultimate Level output: %" PRIu64
+ ", level %d, number of files: %" PRIu64 ", number of records: %" PRIu64,
+ column_family_name.c_str(),
+ compaction_stats_.penultimate_level_stats.bytes_written,
+ compact_->compaction->GetPenultimateLevel(),
+ compaction_stats_.penultimate_level_stats.num_output_files,
+ compaction_stats_.penultimate_level_stats.num_output_records);
+ }
+
+ UpdateCompactionJobStats(stats);
+
+ auto stream = event_logger_->LogToBuffer(log_buffer_, 8192);
+ stream << "job" << job_id_ << "event"
+ << "compaction_finished"
+ << "compaction_time_micros" << stats.micros
+ << "compaction_time_cpu_micros" << stats.cpu_micros << "output_level"
+ << compact_->compaction->output_level() << "num_output_files"
+ << stats.num_output_files << "total_output_size"
+ << stats.bytes_written;
+
+ if (stats.num_output_files_blob > 0) {
+ stream << "num_blob_output_files" << stats.num_output_files_blob
+ << "total_blob_output_size" << stats.bytes_written_blob;
+ }
+
+ stream << "num_input_records" << stats.num_input_records
+ << "num_output_records" << stats.num_output_records
+ << "num_subcompactions" << compact_->sub_compact_states.size()
+ << "output_compression"
+ << CompressionTypeToString(compact_->compaction->output_compression());
+
+ stream << "num_single_delete_mismatches"
+ << compaction_job_stats_->num_single_del_mismatch;
+ stream << "num_single_delete_fallthrough"
+ << compaction_job_stats_->num_single_del_fallthru;
+
+ if (measure_io_stats_) {
+ stream << "file_write_nanos" << compaction_job_stats_->file_write_nanos;
+ stream << "file_range_sync_nanos"
+ << compaction_job_stats_->file_range_sync_nanos;
+ stream << "file_fsync_nanos" << compaction_job_stats_->file_fsync_nanos;
+ stream << "file_prepare_write_nanos"
+ << compaction_job_stats_->file_prepare_write_nanos;
+ }
+
+ stream << "lsm_state";
+ stream.StartArray();
+ for (int level = 0; level < vstorage->num_levels(); ++level) {
+ stream << vstorage->NumLevelFiles(level);
+ }
+ stream.EndArray();
+
+ if (!blob_files.empty()) {
+ assert(blob_files.front());
+ stream << "blob_file_head" << blob_files.front()->GetBlobFileNumber();
+
+ assert(blob_files.back());
+ stream << "blob_file_tail" << blob_files.back()->GetBlobFileNumber();
+ }
+
+ if (compaction_stats_.has_penultimate_level_output) {
+ InternalStats::CompactionStats& pl_stats =
+ compaction_stats_.penultimate_level_stats;
+ stream << "penultimate_level_num_output_files" << pl_stats.num_output_files;
+ stream << "penultimate_level_bytes_written" << pl_stats.bytes_written;
+ stream << "penultimate_level_num_output_records"
+ << pl_stats.num_output_records;
+ stream << "penultimate_level_num_output_files_blob"
+ << pl_stats.num_output_files_blob;
+ stream << "penultimate_level_bytes_written_blob"
+ << pl_stats.bytes_written_blob;
+ }
+
+ CleanupCompaction();
+ return status;
+}
+
+void CompactionJob::NotifyOnSubcompactionBegin(
+ SubcompactionState* sub_compact) {
+#ifndef ROCKSDB_LITE
+ Compaction* c = compact_->compaction;
+
+ if (db_options_.listeners.empty()) {
+ return;
+ }
+ if (shutting_down_->load(std::memory_order_acquire)) {
+ return;
+ }
+ if (c->is_manual_compaction() &&
+ manual_compaction_canceled_.load(std::memory_order_acquire)) {
+ return;
+ }
+
+ sub_compact->notify_on_subcompaction_completion = true;
+
+ SubcompactionJobInfo info{};
+ sub_compact->BuildSubcompactionJobInfo(info);
+ info.job_id = static_cast<int>(job_id_);
+ info.thread_id = env_->GetThreadID();
+
+ for (const auto& listener : db_options_.listeners) {
+ listener->OnSubcompactionBegin(info);
+ }
+ info.status.PermitUncheckedError();
+
+#else
+ (void)sub_compact;
+#endif // ROCKSDB_LITE
+}
+
+void CompactionJob::NotifyOnSubcompactionCompleted(
+ SubcompactionState* sub_compact) {
+#ifndef ROCKSDB_LITE
+
+ if (db_options_.listeners.empty()) {
+ return;
+ }
+ if (shutting_down_->load(std::memory_order_acquire)) {
+ return;
+ }
+
+ if (sub_compact->notify_on_subcompaction_completion == false) {
+ return;
+ }
+
+ SubcompactionJobInfo info{};
+ sub_compact->BuildSubcompactionJobInfo(info);
+ info.job_id = static_cast<int>(job_id_);
+ info.thread_id = env_->GetThreadID();
+
+ for (const auto& listener : db_options_.listeners) {
+ listener->OnSubcompactionCompleted(info);
+ }
+#else
+ (void)sub_compact;
+#endif // ROCKSDB_LITE
+}
+
+void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
+ assert(sub_compact);
+ assert(sub_compact->compaction);
+
+#ifndef ROCKSDB_LITE
+ if (db_options_.compaction_service) {
+ CompactionServiceJobStatus comp_status =
+ ProcessKeyValueCompactionWithCompactionService(sub_compact);
+ if (comp_status == CompactionServiceJobStatus::kSuccess ||
+ comp_status == CompactionServiceJobStatus::kFailure) {
+ return;
+ }
+ // fallback to local compaction
+ assert(comp_status == CompactionServiceJobStatus::kUseLocal);
+ }
+#endif // !ROCKSDB_LITE
+
+ uint64_t prev_cpu_micros = db_options_.clock->CPUMicros();
+
+ ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();
+
+ // Create compaction filter and fail the compaction if
+ // IgnoreSnapshots() = false because it is not supported anymore
+ const CompactionFilter* compaction_filter =
+ cfd->ioptions()->compaction_filter;
+ std::unique_ptr<CompactionFilter> compaction_filter_from_factory = nullptr;
+ if (compaction_filter == nullptr) {
+ compaction_filter_from_factory =
+ sub_compact->compaction->CreateCompactionFilter();
+ compaction_filter = compaction_filter_from_factory.get();
+ }
+ if (compaction_filter != nullptr && !compaction_filter->IgnoreSnapshots()) {
+ sub_compact->status = Status::NotSupported(
+ "CompactionFilter::IgnoreSnapshots() = false is not supported "
+ "anymore.");
+ return;
+ }
+
+ NotifyOnSubcompactionBegin(sub_compact);
+
+ auto range_del_agg = std::make_unique<CompactionRangeDelAggregator>(
+ &cfd->internal_comparator(), existing_snapshots_, &full_history_ts_low_,
+ &trim_ts_);
+
+ // TODO: since we already use C++17, should use
+ // std::optional<const Slice> instead.
+ const std::optional<Slice> start = sub_compact->start;
+ const std::optional<Slice> end = sub_compact->end;
+
+ std::optional<Slice> start_without_ts;
+ std::optional<Slice> end_without_ts;
+
+ ReadOptions read_options;
+ read_options.verify_checksums = true;
+ read_options.fill_cache = false;
+ read_options.rate_limiter_priority = GetRateLimiterPriority();
+ // Compaction iterators shouldn't be confined to a single prefix.
+ // Compactions use Seek() for
+ // (a) concurrent compactions,
+ // (b) CompactionFilter::Decision::kRemoveAndSkipUntil.
+ read_options.total_order_seek = true;
+
+  // Remove the timestamps from the boundaries because the boundaries created
+  // in GenSubcompactionBoundaries do not strip away the timestamp.
+ size_t ts_sz = cfd->user_comparator()->timestamp_size();
+ if (start.has_value()) {
+ read_options.iterate_lower_bound = &start.value();
+ if (ts_sz > 0) {
+ start_without_ts = StripTimestampFromUserKey(start.value(), ts_sz);
+ read_options.iterate_lower_bound = &start_without_ts.value();
+ }
+ }
+ if (end.has_value()) {
+ read_options.iterate_upper_bound = &end.value();
+ if (ts_sz > 0) {
+ end_without_ts = StripTimestampFromUserKey(end.value(), ts_sz);
+ read_options.iterate_upper_bound = &end_without_ts.value();
+ }
+ }
+
+ // Although the v2 aggregator is what the level iterator(s) know about,
+ // the AddTombstones calls will be propagated down to the v1 aggregator.
+ std::unique_ptr<InternalIterator> raw_input(versions_->MakeInputIterator(
+ read_options, sub_compact->compaction, range_del_agg.get(),
+ file_options_for_read_, start, end));
+ InternalIterator* input = raw_input.get();
+
+ IterKey start_ikey;
+ IterKey end_ikey;
+ Slice start_slice;
+ Slice end_slice;
+
+ static constexpr char kMaxTs[] =
+ "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff";
+ Slice ts_slice;
+ std::string max_ts;
+ if (ts_sz > 0) {
+ if (ts_sz <= strlen(kMaxTs)) {
+ ts_slice = Slice(kMaxTs, ts_sz);
+ } else {
+ max_ts = std::string(ts_sz, '\xff');
+ ts_slice = Slice(max_ts);
+ }
+ }
+
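+  // The boundary internal keys below are built from the user-key boundaries
+  // plus kMaxSequenceNumber and, when user-defined timestamps are enabled,
+  // the maximum (all-0xff) timestamp of the configured size prepared above.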
+ if (start.has_value()) {
+ start_ikey.SetInternalKey(start.value(), kMaxSequenceNumber,
+ kValueTypeForSeek);
+ if (ts_sz > 0) {
+ start_ikey.UpdateInternalKey(kMaxSequenceNumber, kValueTypeForSeek,
+ &ts_slice);
+ }
+ start_slice = start_ikey.GetInternalKey();
+ }
+ if (end.has_value()) {
+ end_ikey.SetInternalKey(end.value(), kMaxSequenceNumber, kValueTypeForSeek);
+ if (ts_sz > 0) {
+ end_ikey.UpdateInternalKey(kMaxSequenceNumber, kValueTypeForSeek,
+ &ts_slice);
+ }
+ end_slice = end_ikey.GetInternalKey();
+ }
+
+ std::unique_ptr<InternalIterator> clip;
+ if (start.has_value() || end.has_value()) {
+ clip = std::make_unique<ClippingIterator>(
+ raw_input.get(), start.has_value() ? &start_slice : nullptr,
+ end.has_value() ? &end_slice : nullptr, &cfd->internal_comparator());
+ input = clip.get();
+ }
+
+ std::unique_ptr<InternalIterator> blob_counter;
+
+ if (sub_compact->compaction->DoesInputReferenceBlobFiles()) {
+ BlobGarbageMeter* meter = sub_compact->Current().CreateBlobGarbageMeter();
+ blob_counter = std::make_unique<BlobCountingIterator>(input, meter);
+ input = blob_counter.get();
+ }
+
+ std::unique_ptr<InternalIterator> trim_history_iter;
+ if (ts_sz > 0 && !trim_ts_.empty()) {
+ trim_history_iter = std::make_unique<HistoryTrimmingIterator>(
+ input, cfd->user_comparator(), trim_ts_);
+ input = trim_history_iter.get();
+ }
+
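+  // At this point `input` is the outermost wrapper of the iterator chain
+  // raw_input -> clipping -> blob counting -> history trimming; each wrapper
+  // above was only added when it is actually needed.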
+ input->SeekToFirst();
+
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_COMPACTION_PROCESS_KV);
+
+ // I/O measurement variables
+ PerfLevel prev_perf_level = PerfLevel::kEnableTime;
+ const uint64_t kRecordStatsEvery = 1000;
+ uint64_t prev_write_nanos = 0;
+ uint64_t prev_fsync_nanos = 0;
+ uint64_t prev_range_sync_nanos = 0;
+ uint64_t prev_prepare_write_nanos = 0;
+ uint64_t prev_cpu_write_nanos = 0;
+ uint64_t prev_cpu_read_nanos = 0;
+ if (measure_io_stats_) {
+ prev_perf_level = GetPerfLevel();
+ SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex);
+ prev_write_nanos = IOSTATS(write_nanos);
+ prev_fsync_nanos = IOSTATS(fsync_nanos);
+ prev_range_sync_nanos = IOSTATS(range_sync_nanos);
+ prev_prepare_write_nanos = IOSTATS(prepare_write_nanos);
+ prev_cpu_write_nanos = IOSTATS(cpu_write_nanos);
+ prev_cpu_read_nanos = IOSTATS(cpu_read_nanos);
+ }
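+  // The prev_* values above snapshot the thread-local IOSTATS counters so
+  // that the per-subcompaction deltas can be computed after the main
+  // compaction loop below has finished.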
+
+ MergeHelper merge(
+ env_, cfd->user_comparator(), cfd->ioptions()->merge_operator.get(),
+ compaction_filter, db_options_.info_log.get(),
+ false /* internal key corruption is expected */,
+ existing_snapshots_.empty() ? 0 : existing_snapshots_.back(),
+ snapshot_checker_, compact_->compaction->level(), db_options_.stats);
+
+ const MutableCFOptions* mutable_cf_options =
+ sub_compact->compaction->mutable_cf_options();
+ assert(mutable_cf_options);
+
+ std::vector<std::string> blob_file_paths;
+
+ // TODO: BlobDB to support output_to_penultimate_level compaction, which needs
+ // 2 builders, so may need to move to `CompactionOutputs`
+ std::unique_ptr<BlobFileBuilder> blob_file_builder(
+ (mutable_cf_options->enable_blob_files &&
+ sub_compact->compaction->output_level() >=
+ mutable_cf_options->blob_file_starting_level)
+ ? new BlobFileBuilder(
+ versions_, fs_.get(),
+ sub_compact->compaction->immutable_options(),
+ mutable_cf_options, &file_options_, db_id_, db_session_id_,
+ job_id_, cfd->GetID(), cfd->GetName(), Env::IOPriority::IO_LOW,
+ write_hint_, io_tracer_, blob_callback_,
+ BlobFileCreationReason::kCompaction, &blob_file_paths,
+ sub_compact->Current().GetBlobFileAdditionsPtr())
+ : nullptr);
+
+ TEST_SYNC_POINT("CompactionJob::Run():Inprogress");
+ TEST_SYNC_POINT_CALLBACK(
+ "CompactionJob::Run():PausingManualCompaction:1",
+ reinterpret_cast<void*>(
+ const_cast<std::atomic<bool>*>(&manual_compaction_canceled_)));
+
+ const std::string* const full_history_ts_low =
+ full_history_ts_low_.empty() ? nullptr : &full_history_ts_low_;
+ const SequenceNumber job_snapshot_seq =
+ job_context_ ? job_context_->GetJobSnapshotSequence()
+ : kMaxSequenceNumber;
+
+ auto c_iter = std::make_unique<CompactionIterator>(
+ input, cfd->user_comparator(), &merge, versions_->LastSequence(),
+ &existing_snapshots_, earliest_write_conflict_snapshot_, job_snapshot_seq,
+ snapshot_checker_, env_, ShouldReportDetailedTime(env_, stats_),
+ /*expect_valid_internal_key=*/true, range_del_agg.get(),
+ blob_file_builder.get(), db_options_.allow_data_in_errors,
+ db_options_.enforce_single_del_contracts, manual_compaction_canceled_,
+ sub_compact->compaction, compaction_filter, shutting_down_,
+ db_options_.info_log, full_history_ts_low, preserve_time_min_seqno_,
+ preclude_last_level_min_seqno_);
+ c_iter->SeekToFirst();
+
+  // Assign the range delete aggregator to the target output level, which
+  // makes sure it only outputs to a single level.
+ sub_compact->AssignRangeDelAggregator(std::move(range_del_agg));
+
+ const auto& c_iter_stats = c_iter->iter_stats();
+
+  // Define the open and close functions for the compaction output files;
+  // they will be used to open/close output files when needed.
+ const CompactionFileOpenFunc open_file_func =
+ [this, sub_compact](CompactionOutputs& outputs) {
+ return this->OpenCompactionOutputFile(sub_compact, outputs);
+ };
+ const CompactionFileCloseFunc close_file_func =
+ [this, sub_compact](CompactionOutputs& outputs, const Status& status,
+ const Slice& next_table_min_key) {
+ return this->FinishCompactionOutputFile(status, sub_compact, outputs,
+ next_table_min_key);
+ };
+
+ Status status;
+ TEST_SYNC_POINT_CALLBACK(
+ "CompactionJob::ProcessKeyValueCompaction()::Processing",
+ reinterpret_cast<void*>(
+ const_cast<Compaction*>(sub_compact->compaction)));
+ while (status.ok() && !cfd->IsDropped() && c_iter->Valid()) {
+ // Invariant: c_iter.status() is guaranteed to be OK if c_iter->Valid()
+ // returns true.
+
+ assert(!end.has_value() || cfd->user_comparator()->Compare(
+ c_iter->user_key(), end.value()) < 0);
+
+ if (c_iter_stats.num_input_records % kRecordStatsEvery ==
+ kRecordStatsEvery - 1) {
+ RecordDroppedKeys(c_iter_stats, &sub_compact->compaction_job_stats);
+ c_iter->ResetRecordCounts();
+ RecordCompactionIOStats();
+ }
+
+    // Add the current compaction_iterator key to the target compaction
+    // output; if the output file needs to be closed or opened, this will call
+    // `open_file_func` and `close_file_func`.
+ // TODO: it would be better to have the compaction file open/close moved
+ // into `CompactionOutputs` which has the output file information.
+ status = sub_compact->AddToOutput(*c_iter, open_file_func, close_file_func);
+ if (!status.ok()) {
+ break;
+ }
+
+ TEST_SYNC_POINT_CALLBACK(
+ "CompactionJob::Run():PausingManualCompaction:2",
+ reinterpret_cast<void*>(
+ const_cast<std::atomic<bool>*>(&manual_compaction_canceled_)));
+ c_iter->Next();
+ if (c_iter->status().IsManualCompactionPaused()) {
+ break;
+ }
+ }
+
+ sub_compact->compaction_job_stats.num_blobs_read =
+ c_iter_stats.num_blobs_read;
+ sub_compact->compaction_job_stats.total_blob_bytes_read =
+ c_iter_stats.total_blob_bytes_read;
+ sub_compact->compaction_job_stats.num_input_deletion_records =
+ c_iter_stats.num_input_deletion_records;
+ sub_compact->compaction_job_stats.num_corrupt_keys =
+ c_iter_stats.num_input_corrupt_records;
+ sub_compact->compaction_job_stats.num_single_del_fallthru =
+ c_iter_stats.num_single_del_fallthru;
+ sub_compact->compaction_job_stats.num_single_del_mismatch =
+ c_iter_stats.num_single_del_mismatch;
+ sub_compact->compaction_job_stats.total_input_raw_key_bytes +=
+ c_iter_stats.total_input_raw_key_bytes;
+ sub_compact->compaction_job_stats.total_input_raw_value_bytes +=
+ c_iter_stats.total_input_raw_value_bytes;
+
+ RecordTick(stats_, FILTER_OPERATION_TOTAL_TIME,
+ c_iter_stats.total_filter_time);
+
+ if (c_iter_stats.num_blobs_relocated > 0) {
+ RecordTick(stats_, BLOB_DB_GC_NUM_KEYS_RELOCATED,
+ c_iter_stats.num_blobs_relocated);
+ }
+ if (c_iter_stats.total_blob_bytes_relocated > 0) {
+ RecordTick(stats_, BLOB_DB_GC_BYTES_RELOCATED,
+ c_iter_stats.total_blob_bytes_relocated);
+ }
+
+ RecordDroppedKeys(c_iter_stats, &sub_compact->compaction_job_stats);
+ RecordCompactionIOStats();
+
+ if (status.ok() && cfd->IsDropped()) {
+ status =
+ Status::ColumnFamilyDropped("Column family dropped during compaction");
+ }
+ if ((status.ok() || status.IsColumnFamilyDropped()) &&
+ shutting_down_->load(std::memory_order_relaxed)) {
+ status = Status::ShutdownInProgress("Database shutdown");
+ }
+ if ((status.ok() || status.IsColumnFamilyDropped()) &&
+ (manual_compaction_canceled_.load(std::memory_order_relaxed))) {
+ status = Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+ }
+ if (status.ok()) {
+ status = input->status();
+ }
+ if (status.ok()) {
+ status = c_iter->status();
+ }
+
+  // Call FinishCompactionOutputFile() even if status is not ok: it needs to
+  // close the output files. The open file function is also passed because, if
+  // there are only range-dels and no file has been opened yet, a new output
+  // file needs to be created to save the range-dels.
+ status = sub_compact->CloseCompactionFiles(status, open_file_func,
+ close_file_func);
+
+ if (blob_file_builder) {
+ if (status.ok()) {
+ status = blob_file_builder->Finish();
+ } else {
+ blob_file_builder->Abandon(status);
+ }
+ blob_file_builder.reset();
+ sub_compact->Current().UpdateBlobStats();
+ }
+
+ sub_compact->compaction_job_stats.cpu_micros =
+ db_options_.clock->CPUMicros() - prev_cpu_micros;
+
+ if (measure_io_stats_) {
+ sub_compact->compaction_job_stats.file_write_nanos +=
+ IOSTATS(write_nanos) - prev_write_nanos;
+ sub_compact->compaction_job_stats.file_fsync_nanos +=
+ IOSTATS(fsync_nanos) - prev_fsync_nanos;
+ sub_compact->compaction_job_stats.file_range_sync_nanos +=
+ IOSTATS(range_sync_nanos) - prev_range_sync_nanos;
+ sub_compact->compaction_job_stats.file_prepare_write_nanos +=
+ IOSTATS(prepare_write_nanos) - prev_prepare_write_nanos;
+ sub_compact->compaction_job_stats.cpu_micros -=
+ (IOSTATS(cpu_write_nanos) - prev_cpu_write_nanos +
+ IOSTATS(cpu_read_nanos) - prev_cpu_read_nanos) /
+ 1000;
+ if (prev_perf_level != PerfLevel::kEnableTimeAndCPUTimeExceptForMutex) {
+ SetPerfLevel(prev_perf_level);
+ }
+ }
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
+ if (!status.ok()) {
+ if (c_iter) {
+ c_iter->status().PermitUncheckedError();
+ }
+ if (input) {
+ input->status().PermitUncheckedError();
+ }
+ }
+#endif // ROCKSDB_ASSERT_STATUS_CHECKED
+
+ blob_counter.reset();
+ clip.reset();
+ raw_input.reset();
+ sub_compact->status = status;
+ NotifyOnSubcompactionCompleted(sub_compact);
+}
+
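+// The compaction id packs the 32-bit job id into the upper 32 bits and the
+// subcompaction's sub_job_id into the lower 32 bits, so the ids stay unique
+// across the subcompactions of a single job.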
+uint64_t CompactionJob::GetCompactionId(SubcompactionState* sub_compact) const {
+ return (uint64_t)job_id_ << 32 | sub_compact->sub_job_id;
+}
+
+void CompactionJob::RecordDroppedKeys(
+ const CompactionIterationStats& c_iter_stats,
+ CompactionJobStats* compaction_job_stats) {
+ if (c_iter_stats.num_record_drop_user > 0) {
+ RecordTick(stats_, COMPACTION_KEY_DROP_USER,
+ c_iter_stats.num_record_drop_user);
+ }
+ if (c_iter_stats.num_record_drop_hidden > 0) {
+ RecordTick(stats_, COMPACTION_KEY_DROP_NEWER_ENTRY,
+ c_iter_stats.num_record_drop_hidden);
+ if (compaction_job_stats) {
+ compaction_job_stats->num_records_replaced +=
+ c_iter_stats.num_record_drop_hidden;
+ }
+ }
+ if (c_iter_stats.num_record_drop_obsolete > 0) {
+ RecordTick(stats_, COMPACTION_KEY_DROP_OBSOLETE,
+ c_iter_stats.num_record_drop_obsolete);
+ if (compaction_job_stats) {
+ compaction_job_stats->num_expired_deletion_records +=
+ c_iter_stats.num_record_drop_obsolete;
+ }
+ }
+ if (c_iter_stats.num_record_drop_range_del > 0) {
+ RecordTick(stats_, COMPACTION_KEY_DROP_RANGE_DEL,
+ c_iter_stats.num_record_drop_range_del);
+ }
+ if (c_iter_stats.num_range_del_drop_obsolete > 0) {
+ RecordTick(stats_, COMPACTION_RANGE_DEL_DROP_OBSOLETE,
+ c_iter_stats.num_range_del_drop_obsolete);
+ }
+ if (c_iter_stats.num_optimized_del_drop_obsolete > 0) {
+ RecordTick(stats_, COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE,
+ c_iter_stats.num_optimized_del_drop_obsolete);
+ }
+}
+
+Status CompactionJob::FinishCompactionOutputFile(
+ const Status& input_status, SubcompactionState* sub_compact,
+ CompactionOutputs& outputs, const Slice& next_table_min_key) {
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_COMPACTION_SYNC_FILE);
+ assert(sub_compact != nullptr);
+ assert(outputs.HasBuilder());
+
+ FileMetaData* meta = outputs.GetMetaData();
+ uint64_t output_number = meta->fd.GetNumber();
+ assert(output_number != 0);
+
+ ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();
+ std::string file_checksum = kUnknownFileChecksum;
+ std::string file_checksum_func_name = kUnknownFileChecksumFuncName;
+
+ // Check for iterator errors
+ Status s = input_status;
+
+ // Add range tombstones
+ auto earliest_snapshot = kMaxSequenceNumber;
+ if (existing_snapshots_.size() > 0) {
+ earliest_snapshot = existing_snapshots_[0];
+ }
+ if (s.ok()) {
+ CompactionIterationStats range_del_out_stats;
+ // if the compaction supports per_key_placement, only output range dels to
+ // the penultimate level.
+ // Note: Use `bottommost_level_ = true` for both bottommost and
+ // output_to_penultimate_level compaction here, as it's only used to decide
+ // if range dels could be dropped.
+ if (outputs.HasRangeDel()) {
+ s = outputs.AddRangeDels(
+ sub_compact->start.has_value() ? &(sub_compact->start.value())
+ : nullptr,
+ sub_compact->end.has_value() ? &(sub_compact->end.value()) : nullptr,
+ range_del_out_stats, bottommost_level_, cfd->internal_comparator(),
+ earliest_snapshot, next_table_min_key, full_history_ts_low_);
+ }
+ RecordDroppedKeys(range_del_out_stats, &sub_compact->compaction_job_stats);
+ TEST_SYNC_POINT("CompactionJob::FinishCompactionOutputFile1");
+ }
+
+ const uint64_t current_entries = outputs.NumEntries();
+
+ s = outputs.Finish(s, seqno_time_mapping_);
+
+ if (s.ok()) {
+    // With accurate smallest and largest keys, we can get a slightly more
+    // accurate oldest ancester time. This makes the oldest ancester time in
+    // the manifest more accurate than in the table properties; it is not
+    // clear how to resolve that discrepancy.
+ if (meta->smallest.size() > 0 && meta->largest.size() > 0) {
+ uint64_t refined_oldest_ancester_time;
+ Slice new_smallest = meta->smallest.user_key();
+ Slice new_largest = meta->largest.user_key();
+ if (!new_largest.empty() && !new_smallest.empty()) {
+ refined_oldest_ancester_time =
+ sub_compact->compaction->MinInputFileOldestAncesterTime(
+ &(meta->smallest), &(meta->largest));
+ if (refined_oldest_ancester_time !=
+ std::numeric_limits<uint64_t>::max()) {
+ meta->oldest_ancester_time = refined_oldest_ancester_time;
+ }
+ }
+ }
+ }
+
+ // Finish and check for file errors
+ IOStatus io_s = outputs.WriterSyncClose(s, db_options_.clock, stats_,
+ db_options_.use_fsync);
+
+ if (s.ok() && io_s.ok()) {
+ file_checksum = meta->file_checksum;
+ file_checksum_func_name = meta->file_checksum_func_name;
+ }
+
+ if (s.ok()) {
+ s = io_s;
+ }
+ if (sub_compact->io_status.ok()) {
+ sub_compact->io_status = io_s;
+ // Since this error is really a copy of the
+ // "normal" status, it does not also need to be checked
+ sub_compact->io_status.PermitUncheckedError();
+ }
+
+ TableProperties tp;
+ if (s.ok()) {
+ tp = outputs.GetTableProperties();
+ }
+
+ if (s.ok() && current_entries == 0 && tp.num_range_deletions == 0) {
+    // If there is nothing to output, there is no need to generate an SST
+    // file. This happens when the output level is the bottom level and, at
+    // the same time, the sub_compact outputs nothing.
+ std::string fname =
+ TableFileName(sub_compact->compaction->immutable_options()->cf_paths,
+ meta->fd.GetNumber(), meta->fd.GetPathId());
+
+ // TODO(AR) it is not clear if there are any larger implications if
+ // DeleteFile fails here
+ Status ds = env_->DeleteFile(fname);
+ if (!ds.ok()) {
+ ROCKS_LOG_WARN(
+ db_options_.info_log,
+ "[%s] [JOB %d] Unable to remove SST file for table #%" PRIu64
+ " at bottom level%s",
+ cfd->GetName().c_str(), job_id_, output_number,
+ meta->marked_for_compaction ? " (need compaction)" : "");
+ }
+
+ // Also need to remove the file from outputs, or it will be added to the
+ // VersionEdit.
+ outputs.RemoveLastOutput();
+ meta = nullptr;
+ }
+
+ if (s.ok() && (current_entries > 0 || tp.num_range_deletions > 0)) {
+ // Output to event logger and fire events.
+ outputs.UpdateTableProperties();
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "[%s] [JOB %d] Generated table #%" PRIu64 ": %" PRIu64
+ " keys, %" PRIu64 " bytes%s, temperature: %s",
+ cfd->GetName().c_str(), job_id_, output_number,
+ current_entries, meta->fd.file_size,
+ meta->marked_for_compaction ? " (need compaction)" : "",
+ temperature_to_string[meta->temperature].c_str());
+ }
+ std::string fname;
+ FileDescriptor output_fd;
+ uint64_t oldest_blob_file_number = kInvalidBlobFileNumber;
+ Status status_for_listener = s;
+ if (meta != nullptr) {
+ fname = GetTableFileName(meta->fd.GetNumber());
+ output_fd = meta->fd;
+ oldest_blob_file_number = meta->oldest_blob_file_number;
+ } else {
+ fname = "(nil)";
+ if (s.ok()) {
+ status_for_listener = Status::Aborted("Empty SST file not kept");
+ }
+ }
+ EventHelpers::LogAndNotifyTableFileCreationFinished(
+ event_logger_, cfd->ioptions()->listeners, dbname_, cfd->GetName(), fname,
+ job_id_, output_fd, oldest_blob_file_number, tp,
+ TableFileCreationReason::kCompaction, status_for_listener, file_checksum,
+ file_checksum_func_name);
+
+#ifndef ROCKSDB_LITE
+ // Report new file to SstFileManagerImpl
+ auto sfm =
+ static_cast<SstFileManagerImpl*>(db_options_.sst_file_manager.get());
+ if (sfm && meta != nullptr && meta->fd.GetPathId() == 0) {
+ Status add_s = sfm->OnAddFile(fname);
+ if (!add_s.ok() && s.ok()) {
+ s = add_s;
+ }
+ if (sfm->IsMaxAllowedSpaceReached()) {
+ // TODO(ajkr): should we return OK() if max space was reached by the final
+ // compaction output file (similarly to how flush works when full)?
+ s = Status::SpaceLimit("Max allowed space was reached");
+ TEST_SYNC_POINT(
+ "CompactionJob::FinishCompactionOutputFile:MaxAllowedSpaceReached");
+ InstrumentedMutexLock l(db_mutex_);
+ db_error_handler_->SetBGError(s, BackgroundErrorReason::kCompaction);
+ }
+ }
+#endif
+
+ outputs.ResetBuilder();
+ return s;
+}
+
+Status CompactionJob::InstallCompactionResults(
+ const MutableCFOptions& mutable_cf_options) {
+ assert(compact_);
+
+ db_mutex_->AssertHeld();
+
+ auto* compaction = compact_->compaction;
+ assert(compaction);
+
+ {
+ Compaction::InputLevelSummaryBuffer inputs_summary;
+ if (compaction_stats_.has_penultimate_level_output) {
+ ROCKS_LOG_BUFFER(
+ log_buffer_,
+ "[%s] [JOB %d] Compacted %s => output_to_penultimate_level: %" PRIu64
+ " bytes + last: %" PRIu64 " bytes. Total: %" PRIu64 " bytes",
+ compaction->column_family_data()->GetName().c_str(), job_id_,
+ compaction->InputLevelSummary(&inputs_summary),
+ compaction_stats_.penultimate_level_stats.bytes_written,
+ compaction_stats_.stats.bytes_written,
+ compaction_stats_.TotalBytesWritten());
+ } else {
+ ROCKS_LOG_BUFFER(log_buffer_,
+ "[%s] [JOB %d] Compacted %s => %" PRIu64 " bytes",
+ compaction->column_family_data()->GetName().c_str(),
+ job_id_, compaction->InputLevelSummary(&inputs_summary),
+ compaction_stats_.TotalBytesWritten());
+ }
+ }
+
+ VersionEdit* const edit = compaction->edit();
+ assert(edit);
+
+ // Add compaction inputs
+ compaction->AddInputDeletions(edit);
+
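+  // Collect every subcompaction's outputs into the version edit, and
+  // aggregate the blob garbage they generated per blob file so it can be
+  // recorded in the edit as well.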
+ std::unordered_map<uint64_t, BlobGarbageMeter::BlobStats> blob_total_garbage;
+
+ for (const auto& sub_compact : compact_->sub_compact_states) {
+ sub_compact.AddOutputsEdit(edit);
+
+ for (const auto& blob : sub_compact.Current().GetBlobFileAdditions()) {
+ edit->AddBlobFile(blob);
+ }
+
+ if (sub_compact.Current().GetBlobGarbageMeter()) {
+ const auto& flows = sub_compact.Current().GetBlobGarbageMeter()->flows();
+
+ for (const auto& pair : flows) {
+ const uint64_t blob_file_number = pair.first;
+ const BlobGarbageMeter::BlobInOutFlow& flow = pair.second;
+
+ assert(flow.IsValid());
+ if (flow.HasGarbage()) {
+ blob_total_garbage[blob_file_number].Add(flow.GetGarbageCount(),
+ flow.GetGarbageBytes());
+ }
+ }
+ }
+ }
+
+ for (const auto& pair : blob_total_garbage) {
+ const uint64_t blob_file_number = pair.first;
+ const BlobGarbageMeter::BlobStats& stats = pair.second;
+
+ edit->AddBlobFileGarbage(blob_file_number, stats.GetCount(),
+ stats.GetBytes());
+ }
+
+ if ((compaction->compaction_reason() ==
+ CompactionReason::kLevelMaxLevelSize ||
+ compaction->compaction_reason() == CompactionReason::kRoundRobinTtl) &&
+ compaction->immutable_options()->compaction_pri == kRoundRobin) {
+ int start_level = compaction->start_level();
+ if (start_level > 0) {
+ auto vstorage = compaction->input_version()->storage_info();
+ edit->AddCompactCursor(start_level,
+ vstorage->GetNextCompactCursor(
+ start_level, compaction->num_input_files(0)));
+ }
+ }
+
+ return versions_->LogAndApply(compaction->column_family_data(),
+ mutable_cf_options, edit, db_mutex_,
+ db_directory_);
+}
+
+void CompactionJob::RecordCompactionIOStats() {
+ RecordTick(stats_, COMPACT_READ_BYTES, IOSTATS(bytes_read));
+ RecordTick(stats_, COMPACT_WRITE_BYTES, IOSTATS(bytes_written));
+ CompactionReason compaction_reason =
+ compact_->compaction->compaction_reason();
+ if (compaction_reason == CompactionReason::kFilesMarkedForCompaction) {
+ RecordTick(stats_, COMPACT_READ_BYTES_MARKED, IOSTATS(bytes_read));
+ RecordTick(stats_, COMPACT_WRITE_BYTES_MARKED, IOSTATS(bytes_written));
+ } else if (compaction_reason == CompactionReason::kPeriodicCompaction) {
+ RecordTick(stats_, COMPACT_READ_BYTES_PERIODIC, IOSTATS(bytes_read));
+ RecordTick(stats_, COMPACT_WRITE_BYTES_PERIODIC, IOSTATS(bytes_written));
+ } else if (compaction_reason == CompactionReason::kTtl) {
+ RecordTick(stats_, COMPACT_READ_BYTES_TTL, IOSTATS(bytes_read));
+ RecordTick(stats_, COMPACT_WRITE_BYTES_TTL, IOSTATS(bytes_written));
+ }
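+  // The IOSTATS byte counters are consumed and reset below, so each call to
+  // this function only accounts for the I/O performed since the previous
+  // call.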
+ ThreadStatusUtil::IncreaseThreadOperationProperty(
+ ThreadStatus::COMPACTION_BYTES_READ, IOSTATS(bytes_read));
+ IOSTATS_RESET(bytes_read);
+ ThreadStatusUtil::IncreaseThreadOperationProperty(
+ ThreadStatus::COMPACTION_BYTES_WRITTEN, IOSTATS(bytes_written));
+ IOSTATS_RESET(bytes_written);
+}
+
+Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* sub_compact,
+ CompactionOutputs& outputs) {
+ assert(sub_compact != nullptr);
+
+ // no need to lock because VersionSet::next_file_number_ is atomic
+ uint64_t file_number = versions_->NewFileNumber();
+ std::string fname = GetTableFileName(file_number);
+ // Fire events.
+ ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();
+#ifndef ROCKSDB_LITE
+ EventHelpers::NotifyTableFileCreationStarted(
+ cfd->ioptions()->listeners, dbname_, cfd->GetName(), fname, job_id_,
+ TableFileCreationReason::kCompaction);
+#endif // !ROCKSDB_LITE
+ // Make the output file
+ std::unique_ptr<FSWritableFile> writable_file;
+#ifndef NDEBUG
+ bool syncpoint_arg = file_options_.use_direct_writes;
+ TEST_SYNC_POINT_CALLBACK("CompactionJob::OpenCompactionOutputFile",
+ &syncpoint_arg);
+#endif
+
+ // Pass temperature of the last level files to FileSystem.
+ FileOptions fo_copy = file_options_;
+ Temperature temperature = sub_compact->compaction->output_temperature();
+  // Only set for the last level compaction, and only when the output is not
+  // going to the penultimate level (which is possible when the
+  // preclude_last_level feature is enabled).
+ if (temperature == Temperature::kUnknown &&
+ sub_compact->compaction->is_last_level() &&
+ !sub_compact->IsCurrentPenultimateLevel()) {
+ temperature =
+ sub_compact->compaction->mutable_cf_options()->last_level_temperature;
+ }
+ fo_copy.temperature = temperature;
+
+ Status s;
+ IOStatus io_s = NewWritableFile(fs_.get(), fname, &writable_file, fo_copy);
+ s = io_s;
+ if (sub_compact->io_status.ok()) {
+ sub_compact->io_status = io_s;
+ // Since this error is really a copy of the io_s that is checked below as s,
+ // it does not also need to be checked.
+ sub_compact->io_status.PermitUncheckedError();
+ }
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(
+ db_options_.info_log,
+ "[%s] [JOB %d] OpenCompactionOutputFiles for table #%" PRIu64
+ " fails at NewWritableFile with status %s",
+ sub_compact->compaction->column_family_data()->GetName().c_str(),
+ job_id_, file_number, s.ToString().c_str());
+ LogFlush(db_options_.info_log);
+ EventHelpers::LogAndNotifyTableFileCreationFinished(
+ event_logger_, cfd->ioptions()->listeners, dbname_, cfd->GetName(),
+ fname, job_id_, FileDescriptor(), kInvalidBlobFileNumber,
+ TableProperties(), TableFileCreationReason::kCompaction, s,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName);
+ return s;
+ }
+
+ // Try to figure out the output file's oldest ancester time.
+ int64_t temp_current_time = 0;
+ auto get_time_status = db_options_.clock->GetCurrentTime(&temp_current_time);
+ // Safe to proceed even if GetCurrentTime fails. So, log and proceed.
+ if (!get_time_status.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Failed to get current time. Status: %s",
+ get_time_status.ToString().c_str());
+ }
+ uint64_t current_time = static_cast<uint64_t>(temp_current_time);
+ InternalKey tmp_start, tmp_end;
+ if (sub_compact->start.has_value()) {
+ tmp_start.SetMinPossibleForUserKey(sub_compact->start.value());
+ }
+ if (sub_compact->end.has_value()) {
+ tmp_end.SetMinPossibleForUserKey(sub_compact->end.value());
+ }
+ uint64_t oldest_ancester_time =
+ sub_compact->compaction->MinInputFileOldestAncesterTime(
+ sub_compact->start.has_value() ? &tmp_start : nullptr,
+ sub_compact->end.has_value() ? &tmp_end : nullptr);
+ if (oldest_ancester_time == std::numeric_limits<uint64_t>::max()) {
+ oldest_ancester_time = current_time;
+ }
+
+  // Initialize the output file's FileMetaData and add it to the
+  // compaction outputs
+ {
+ FileMetaData meta;
+ meta.fd = FileDescriptor(file_number,
+ sub_compact->compaction->output_path_id(), 0);
+ meta.oldest_ancester_time = oldest_ancester_time;
+ meta.file_creation_time = current_time;
+ meta.temperature = temperature;
+ assert(!db_id_.empty());
+ assert(!db_session_id_.empty());
+ s = GetSstInternalUniqueId(db_id_, db_session_id_, meta.fd.GetNumber(),
+ &meta.unique_id);
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(db_options_.info_log,
+ "[%s] [JOB %d] file #%" PRIu64
+ " failed to generate unique id: %s.",
+ cfd->GetName().c_str(), job_id_, meta.fd.GetNumber(),
+ s.ToString().c_str());
+ return s;
+ }
+
+ outputs.AddOutput(std::move(meta), cfd->internal_comparator(),
+ sub_compact->compaction->mutable_cf_options()
+ ->check_flush_compaction_key_order,
+ paranoid_file_checks_);
+ }
+
+ writable_file->SetIOPriority(GetRateLimiterPriority());
+ writable_file->SetWriteLifeTimeHint(write_hint_);
+ FileTypeSet tmp_set = db_options_.checksum_handoff_file_types;
+ writable_file->SetPreallocationBlockSize(static_cast<size_t>(
+ sub_compact->compaction->OutputFilePreallocationSize()));
+ const auto& listeners =
+ sub_compact->compaction->immutable_options()->listeners;
+ outputs.AssignFileWriter(new WritableFileWriter(
+ std::move(writable_file), fname, fo_copy, db_options_.clock, io_tracer_,
+ db_options_.stats, listeners, db_options_.file_checksum_gen_factory.get(),
+ tmp_set.Contains(FileType::kTableFile), false));
+
+ TableBuilderOptions tboptions(
+ *cfd->ioptions(), *(sub_compact->compaction->mutable_cf_options()),
+ cfd->internal_comparator(), cfd->int_tbl_prop_collector_factories(),
+ sub_compact->compaction->output_compression(),
+ sub_compact->compaction->output_compression_opts(), cfd->GetID(),
+ cfd->GetName(), sub_compact->compaction->output_level(),
+ bottommost_level_, TableFileCreationReason::kCompaction,
+ 0 /* oldest_key_time */, current_time, db_id_, db_session_id_,
+ sub_compact->compaction->max_output_file_size(), file_number);
+
+ outputs.NewBuilder(tboptions);
+
+ LogFlush(db_options_.info_log);
+ return s;
+}
+
+void CompactionJob::CleanupCompaction() {
+ for (SubcompactionState& sub_compact : compact_->sub_compact_states) {
+ sub_compact.Cleanup(table_cache_.get());
+ }
+ delete compact_;
+ compact_ = nullptr;
+}
+
+#ifndef ROCKSDB_LITE
+namespace {
+void CopyPrefix(const Slice& src, size_t prefix_length, std::string* dst) {
+ assert(prefix_length > 0);
+ size_t length = src.size() > prefix_length ? prefix_length : src.size();
+ dst->assign(src.data(), length);
+}
+} // namespace
+
+#endif // !ROCKSDB_LITE
+
+void CompactionJob::UpdateCompactionStats() {
+ assert(compact_);
+
+ Compaction* compaction = compact_->compaction;
+ compaction_stats_.stats.num_input_files_in_non_output_levels = 0;
+ compaction_stats_.stats.num_input_files_in_output_level = 0;
+ for (int input_level = 0;
+ input_level < static_cast<int>(compaction->num_input_levels());
+ ++input_level) {
+ if (compaction->level(input_level) != compaction->output_level()) {
+ UpdateCompactionInputStatsHelper(
+ &compaction_stats_.stats.num_input_files_in_non_output_levels,
+ &compaction_stats_.stats.bytes_read_non_output_levels, input_level);
+ } else {
+ UpdateCompactionInputStatsHelper(
+ &compaction_stats_.stats.num_input_files_in_output_level,
+ &compaction_stats_.stats.bytes_read_output_level, input_level);
+ }
+ }
+
+ assert(compaction_job_stats_);
+ compaction_stats_.stats.bytes_read_blob =
+ compaction_job_stats_->total_blob_bytes_read;
+
+ compaction_stats_.stats.num_dropped_records =
+ compaction_stats_.DroppedRecords();
+}
+
+void CompactionJob::UpdateCompactionInputStatsHelper(int* num_files,
+ uint64_t* bytes_read,
+ int input_level) {
+ const Compaction* compaction = compact_->compaction;
+ auto num_input_files = compaction->num_input_files(input_level);
+ *num_files += static_cast<int>(num_input_files);
+
+ for (size_t i = 0; i < num_input_files; ++i) {
+ const auto* file_meta = compaction->input(input_level, i);
+ *bytes_read += file_meta->fd.GetFileSize();
+ compaction_stats_.stats.num_input_records +=
+ static_cast<uint64_t>(file_meta->num_entries);
+ }
+}
+
+void CompactionJob::UpdateCompactionJobStats(
+ const InternalStats::CompactionStats& stats) const {
+#ifndef ROCKSDB_LITE
+ compaction_job_stats_->elapsed_micros = stats.micros;
+
+ // input information
+ compaction_job_stats_->total_input_bytes =
+ stats.bytes_read_non_output_levels + stats.bytes_read_output_level;
+ compaction_job_stats_->num_input_records = stats.num_input_records;
+ compaction_job_stats_->num_input_files =
+ stats.num_input_files_in_non_output_levels +
+ stats.num_input_files_in_output_level;
+ compaction_job_stats_->num_input_files_at_output_level =
+ stats.num_input_files_in_output_level;
+
+ // output information
+ compaction_job_stats_->total_output_bytes = stats.bytes_written;
+ compaction_job_stats_->total_output_bytes_blob = stats.bytes_written_blob;
+ compaction_job_stats_->num_output_records = stats.num_output_records;
+ compaction_job_stats_->num_output_files = stats.num_output_files;
+ compaction_job_stats_->num_output_files_blob = stats.num_output_files_blob;
+
+ if (stats.num_output_files > 0) {
+ CopyPrefix(compact_->SmallestUserKey(),
+ CompactionJobStats::kMaxPrefixLength,
+ &compaction_job_stats_->smallest_output_key_prefix);
+ CopyPrefix(compact_->LargestUserKey(), CompactionJobStats::kMaxPrefixLength,
+ &compaction_job_stats_->largest_output_key_prefix);
+ }
+#else
+ (void)stats;
+#endif // !ROCKSDB_LITE
+}
+
+void CompactionJob::LogCompaction() {
+ Compaction* compaction = compact_->compaction;
+ ColumnFamilyData* cfd = compaction->column_family_data();
+
+ // Let's check if anything will get logged. Don't prepare all the info if
+ // we're not logging
+ if (db_options_.info_log_level <= InfoLogLevel::INFO_LEVEL) {
+ Compaction::InputLevelSummaryBuffer inputs_summary;
+ ROCKS_LOG_INFO(
+ db_options_.info_log, "[%s] [JOB %d] Compacting %s, score %.2f",
+ cfd->GetName().c_str(), job_id_,
+ compaction->InputLevelSummary(&inputs_summary), compaction->score());
+ char scratch[2345];
+ compaction->Summary(scratch, sizeof(scratch));
+ ROCKS_LOG_INFO(db_options_.info_log, "[%s]: Compaction start summary: %s\n",
+ cfd->GetName().c_str(), scratch);
+ // build event logger report
+ auto stream = event_logger_->Log();
+ stream << "job" << job_id_ << "event"
+ << "compaction_started"
+ << "compaction_reason"
+ << GetCompactionReasonString(compaction->compaction_reason());
+ for (size_t i = 0; i < compaction->num_input_levels(); ++i) {
+ stream << ("files_L" + std::to_string(compaction->level(i)));
+ stream.StartArray();
+ for (auto f : *compaction->inputs(i)) {
+ stream << f->fd.GetNumber();
+ }
+ stream.EndArray();
+ }
+ stream << "score" << compaction->score() << "input_data_size"
+ << compaction->CalculateTotalInputSize() << "oldest_snapshot_seqno"
+ << (existing_snapshots_.empty()
+ ? int64_t{-1} // Use -1 for "none"
+ : static_cast<int64_t>(existing_snapshots_[0]));
+ if (compaction->SupportsPerKeyPlacement()) {
+ stream << "preclude_last_level_min_seqno"
+ << preclude_last_level_min_seqno_;
+ stream << "penultimate_output_level" << compaction->GetPenultimateLevel();
+ stream << "penultimate_output_range"
+ << GetCompactionPenultimateOutputRangeTypeString(
+ compaction->GetPenultimateOutputRangeType());
+
+ if (compaction->GetPenultimateOutputRangeType() ==
+ Compaction::PenultimateOutputRangeType::kDisabled) {
+ ROCKS_LOG_WARN(
+ db_options_.info_log,
+ "[%s] [JOB %d] Penultimate level output is disabled, likely "
+ "because of the range conflict in the penultimate level",
+ cfd->GetName().c_str(), job_id_);
+ }
+ }
+ }
+}
+
+std::string CompactionJob::GetTableFileName(uint64_t file_number) {
+ return TableFileName(compact_->compaction->immutable_options()->cf_paths,
+ file_number, compact_->compaction->output_path_id());
+}
+
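+// Use the highest (IO_USER) rate limiter priority while the write controller
+// reports delayed or stopped writes, presumably so that the compaction work
+// relieving the write stall is not throttled; otherwise compaction I/O runs
+// at the default IO_LOW priority.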
+Env::IOPriority CompactionJob::GetRateLimiterPriority() {
+ if (versions_ && versions_->GetColumnFamilySet() &&
+ versions_->GetColumnFamilySet()->write_controller()) {
+ WriteController* write_controller =
+ versions_->GetColumnFamilySet()->write_controller();
+ if (write_controller->NeedsDelay() || write_controller->IsStopped()) {
+ return Env::IO_USER;
+ }
+ }
+
+ return Env::IO_LOW;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_job.h b/src/rocksdb/db/compaction/compaction_job.h
new file mode 100644
index 000000000..bfbce1011
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_job.h
@@ -0,0 +1,500 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <atomic>
+#include <deque>
+#include <functional>
+#include <limits>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/blob/blob_file_completion_callback.h"
+#include "db/column_family.h"
+#include "db/compaction/compaction_iterator.h"
+#include "db/compaction/compaction_outputs.h"
+#include "db/flush_scheduler.h"
+#include "db/internal_stats.h"
+#include "db/job_context.h"
+#include "db/log_writer.h"
+#include "db/memtable_list.h"
+#include "db/range_del_aggregator.h"
+#include "db/seqno_to_time_mapping.h"
+#include "db/version_edit.h"
+#include "db/write_controller.h"
+#include "db/write_thread.h"
+#include "logging/event_logger.h"
+#include "options/cf_options.h"
+#include "options/db_options.h"
+#include "port/port.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/compaction_job_stats.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/transaction_log.h"
+#include "table/scoped_arena_iterator.h"
+#include "util/autovector.h"
+#include "util/stop_watch.h"
+#include "util/thread_local.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Arena;
+class CompactionState;
+class ErrorHandler;
+class MemTable;
+class SnapshotChecker;
+class SystemClock;
+class TableCache;
+class Version;
+class VersionEdit;
+class VersionSet;
+
+class SubcompactionState;
+
+// CompactionJob is responsible for executing the compaction. Each (manual or
+// automated) compaction corresponds to a CompactionJob object, and usually
+// goes through the stages of `Prepare()`->`Run()`->`Install()`. CompactionJob
+// will divide the compaction into subcompactions and execute them in parallel
+// if needed.
+//
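+// A typical call sequence, as an illustrative sketch only (the locking shown
+// follows the REQUIRED notes on Prepare()/Run()/Install() below; `db_mutex`
+// stands for the DB mutex and is not a member of this class):
+//
+//   CompactionJob job(/*...*/);
+//   job.Prepare();                           // db_mutex held
+//   db_mutex->Unlock();
+//   Status s = job.Run();                    // db_mutex not held
+//   db_mutex->Lock();
+//   s = job.Install(mutable_cf_options);     // db_mutex held
+//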
+// CompactionJob has 2 main stats:
+// 1. CompactionJobStats compaction_job_stats_
+//    CompactionJobStats is a public data structure which is exposed through
+//    the compaction event listener; it is how RocksDB shares the job stats
+//    with the user.
+// Internally it's an aggregation of all the compaction_job_stats from each
+// `SubcompactionState`:
+// +------------------------+
+// | SubcompactionState |
+// | |
+// +--------->| compaction_job_stats |
+// | | |
+// | +------------------------+
+// +------------------------+ |
+// | CompactionJob | | +------------------------+
+// | | | | SubcompactionState |
+// | compaction_job_stats +-----+ | |
+// | | +--------->| compaction_job_stats |
+// | | | | |
+// +------------------------+ | +------------------------+
+// |
+// | +------------------------+
+// | | SubcompactionState |
+// | | |
+// +--------->+ compaction_job_stats |
+// | | |
+// | +------------------------+
+// |
+// | +------------------------+
+// | | ... |
+// +--------->+ |
+// +------------------------+
+//
+// 2. CompactionStatsFull compaction_stats_
+//   `CompactionStatsFull` holds internal stats about the compaction, which
+//   are eventually sent to `ColumnFamilyData::internal_stats_` and used for
+//   logging and public metrics.
+// Internally, it's an aggregation of stats_ from each `SubcompactionState`.
+// It has 2 parts, normal stats about the main compaction information and
+// the penultimate level output stats.
+//   `SubcompactionState` maintains the CompactionOutputs for the normal
+//   output and, if it exists, the penultimate level output; the per-level
+//   stats are stored with the outputs.
+// +---------------------------+
+// | SubcompactionState |
+// | |
+// | +----------------------+ |
+// | | CompactionOutputs | |
+// | | (normal output) | |
+// +---->| stats_ | |
+// | | +----------------------+ |
+// | | |
+// | | +----------------------+ |
+// +--------------------------------+ | | | CompactionOutputs | |
+// | CompactionJob | | | | (penultimate_level) | |
+// | | +--------->| stats_ | |
+// | compaction_stats_ | | | | +----------------------+ |
+// | +-------------------------+ | | | | |
+// | |stats (normal) |------|----+ +---------------------------+
+// | +-------------------------+ | | |
+// | | | |
+// | +-------------------------+ | | | +---------------------------+
+// | |penultimate_level_stats +------+ | | SubcompactionState |
+// | +-------------------------+ | | | | |
+// | | | | | +----------------------+ |
+// | | | | | | CompactionOutputs | |
+// +--------------------------------+ | | | | (normal output) | |
+// | +---->| stats_ | |
+// | | +----------------------+ |
+// | | |
+// | | +----------------------+ |
+// | | | CompactionOutputs | |
+// | | | (penultimate_level) | |
+// +--------->| stats_ | |
+// | +----------------------+ |
+// | |
+// +---------------------------+
+
+class CompactionJob {
+ public:
+ CompactionJob(
+ int job_id, Compaction* compaction, const ImmutableDBOptions& db_options,
+ const MutableDBOptions& mutable_db_options,
+ const FileOptions& file_options, VersionSet* versions,
+ const std::atomic<bool>* shutting_down, LogBuffer* log_buffer,
+ FSDirectory* db_directory, FSDirectory* output_directory,
+ FSDirectory* blob_output_directory, Statistics* stats,
+ InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler,
+ std::vector<SequenceNumber> existing_snapshots,
+ SequenceNumber earliest_write_conflict_snapshot,
+ const SnapshotChecker* snapshot_checker, JobContext* job_context,
+ std::shared_ptr<Cache> table_cache, EventLogger* event_logger,
+ bool paranoid_file_checks, bool measure_io_stats,
+ const std::string& dbname, CompactionJobStats* compaction_job_stats,
+ Env::Priority thread_pri, const std::shared_ptr<IOTracer>& io_tracer,
+ const std::atomic<bool>& manual_compaction_canceled,
+ const std::string& db_id = "", const std::string& db_session_id = "",
+ std::string full_history_ts_low = "", std::string trim_ts = "",
+ BlobFileCompletionCallback* blob_callback = nullptr,
+ int* bg_compaction_scheduled = nullptr,
+ int* bg_bottom_compaction_scheduled = nullptr);
+
+ virtual ~CompactionJob();
+
+ // no copy/move
+ CompactionJob(CompactionJob&& job) = delete;
+ CompactionJob(const CompactionJob& job) = delete;
+ CompactionJob& operator=(const CompactionJob& job) = delete;
+
+ // REQUIRED: mutex held
+ // Prepare for the compaction by setting up boundaries for each subcompaction
+ void Prepare();
+  // REQUIRED: mutex not held
+ // Launch threads for each subcompaction and wait for them to finish. After
+ // that, verify table is usable and finally do bookkeeping to unify
+ // subcompaction results
+ Status Run();
+
+ // REQUIRED: mutex held
+ // Add compaction input/output to the current version
+ Status Install(const MutableCFOptions& mutable_cf_options);
+
+ // Return the IO status
+ IOStatus io_status() const { return io_status_; }
+
+ protected:
+ void UpdateCompactionStats();
+ void LogCompaction();
+ virtual void RecordCompactionIOStats();
+ void CleanupCompaction();
+
+ // Call compaction filter. Then iterate through input and compact the
+ // kv-pairs
+ void ProcessKeyValueCompaction(SubcompactionState* sub_compact);
+
+ CompactionState* compact_;
+ InternalStats::CompactionStatsFull compaction_stats_;
+ const ImmutableDBOptions& db_options_;
+ const MutableDBOptions mutable_db_options_copy_;
+ LogBuffer* log_buffer_;
+ FSDirectory* output_directory_;
+ Statistics* stats_;
+ // Is this compaction creating a file in the bottom most level?
+ bool bottommost_level_;
+
+ Env::WriteLifeTimeHint write_hint_;
+
+ IOStatus io_status_;
+
+ CompactionJobStats* compaction_job_stats_;
+
+ private:
+ friend class CompactionJobTestBase;
+
+ // Generates a histogram representing potential divisions of key ranges from
+ // the input. It adds the starting and/or ending keys of certain input files
+ // to the working set and then finds the approximate size of data in between
+ // each consecutive pair of slices. Then it divides these ranges into
+ // consecutive groups such that each group has a similar size.
+ void GenSubcompactionBoundaries();
+
+ // Get the number of planned subcompactions based on max_subcompactions and
+ // extra reserved resources
+ uint64_t GetSubcompactionsLimit();
+
+  // Additional threads are reserved and their number is stored in
+  // extra_num_subcompaction_threads_reserved_. For now, this happens only if
+  // the compaction priority is round-robin and max_subcompactions is not
+  // sufficient (extra resources may be needed).
+ void AcquireSubcompactionResources(int num_extra_required_subcompactions);
+
+  // Additional threads may have been reserved in
+  // AcquireSubcompactionResources(). If num_actual_subcompactions is less
+  // than num_planned_subcompactions, those additional threads will be
+  // released, and bg_compaction_scheduled_ or bg_bottom_compaction_scheduled_
+  // will be updated if they are used.
+  // DB Mutex lock is required.
+ void ShrinkSubcompactionResources(uint64_t num_extra_resources);
+
+ // Release all reserved threads and update the compaction limits.
+ void ReleaseSubcompactionResources();
+
+ CompactionServiceJobStatus ProcessKeyValueCompactionWithCompactionService(
+ SubcompactionState* sub_compact);
+
+ // update the thread status for starting a compaction.
+ void ReportStartedCompaction(Compaction* compaction);
+
+ Status FinishCompactionOutputFile(const Status& input_status,
+ SubcompactionState* sub_compact,
+ CompactionOutputs& outputs,
+ const Slice& next_table_min_key);
+ Status InstallCompactionResults(const MutableCFOptions& mutable_cf_options);
+ Status OpenCompactionOutputFile(SubcompactionState* sub_compact,
+ CompactionOutputs& outputs);
+ void UpdateCompactionJobStats(
+ const InternalStats::CompactionStats& stats) const;
+ void RecordDroppedKeys(const CompactionIterationStats& c_iter_stats,
+ CompactionJobStats* compaction_job_stats = nullptr);
+
+ void UpdateCompactionInputStatsHelper(int* num_files, uint64_t* bytes_read,
+ int input_level);
+
+ void NotifyOnSubcompactionBegin(SubcompactionState* sub_compact);
+
+ void NotifyOnSubcompactionCompleted(SubcompactionState* sub_compact);
+
+ uint32_t job_id_;
+
+ // DBImpl state
+ const std::string& dbname_;
+ const std::string db_id_;
+ const std::string db_session_id_;
+ const FileOptions file_options_;
+
+ Env* env_;
+ std::shared_ptr<IOTracer> io_tracer_;
+ FileSystemPtr fs_;
+  // FileOptions optimized for compaction table reads
+ FileOptions file_options_for_read_;
+ VersionSet* versions_;
+ const std::atomic<bool>* shutting_down_;
+ const std::atomic<bool>& manual_compaction_canceled_;
+ FSDirectory* db_directory_;
+ FSDirectory* blob_output_directory_;
+ InstrumentedMutex* db_mutex_;
+ ErrorHandler* db_error_handler_;
+  // If there were two snapshots with seq numbers s1 and
+  // s2 and s1 < s2, and if we find two instances of a key k1 that both lie
+  // entirely within s1 and s2, then the earlier version of k1 can be safely
+  // deleted because that version is not visible in any snapshot.
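+  // For example (an illustration, not from the source): with snapshots at
+  // seq 10 and 20, and two versions of k1 at seq 12 and 15, no snapshot ever
+  // sees the version at seq 12, so it can be dropped during compaction.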
+ std::vector<SequenceNumber> existing_snapshots_;
+
+ // This is the earliest snapshot that could be used for write-conflict
+ // checking by a transaction. For any user-key newer than this snapshot, we
+ // should make sure not to remove evidence that a write occurred.
+ SequenceNumber earliest_write_conflict_snapshot_;
+
+ const SnapshotChecker* const snapshot_checker_;
+
+ JobContext* job_context_;
+
+ std::shared_ptr<Cache> table_cache_;
+
+ EventLogger* event_logger_;
+
+ bool paranoid_file_checks_;
+ bool measure_io_stats_;
+ // Stores the Slices that designate the boundaries for each subcompaction
+ std::vector<std::string> boundaries_;
+ Env::Priority thread_pri_;
+ std::string full_history_ts_low_;
+ std::string trim_ts_;
+ BlobFileCompletionCallback* blob_callback_;
+
+ uint64_t GetCompactionId(SubcompactionState* sub_compact) const;
+  // Stores the number of threads reserved in the shared env_ for the extra
+  // subcompactions in kRoundRobin compaction priority
+ int extra_num_subcompaction_threads_reserved_;
+
+  // Stores the pointers to bg_compaction_scheduled_ and
+  // bg_bottom_compaction_scheduled_ in DBImpl. The DB mutex is required when
+  // accessing or updating them.
+ int* bg_compaction_scheduled_;
+ int* bg_bottom_compaction_scheduled_;
+
+  // Stores the sequence number to time mapping gathered from all input files;
+  // it also collects the smallest_seqno -> oldest_ancester_time from the SSTs.
+ SeqnoToTimeMapping seqno_time_mapping_;
+
+  // Minimal sequence number for preserving the time information. Time info
+  // older than this sequence number won't be preserved after the compaction,
+  // and if it's a bottommost compaction, the seq num will be zeroed out.
+ SequenceNumber preserve_time_min_seqno_ = kMaxSequenceNumber;
+
+  // Minimal sequence number to preclude the data from the last level. If a
+  // key has a bigger (newer) sequence number than this, it will be precluded
+  // from the last level (and output to the penultimate level instead).
+ SequenceNumber preclude_last_level_min_seqno_ = kMaxSequenceNumber;
+
+  // Get the output table file name, which should also be in
+  // `output_directory_`.
+ virtual std::string GetTableFileName(uint64_t file_number);
+  // The rate limiter priority (io_priority) is determined dynamically here.
+  // The compaction read and write priorities are kept the same across
+  // different scenarios, such as write stalls.
+ Env::IOPriority GetRateLimiterPriority();
+};
+
+// CompactionServiceInput is used to pass compaction information between two
+// db instances. It contains the information needed to do a compaction. It
+// doesn't contain the LSM tree information, which is passed through the
+// MANIFEST file.
+struct CompactionServiceInput {
+ ColumnFamilyDescriptor column_family;
+
+ DBOptions db_options;
+
+ std::vector<SequenceNumber> snapshots;
+
+  // SST files for compaction; the list should already be expanded to include
+  // all the files needed for this compaction, for both input level files and
+  // output level files.
+ std::vector<std::string> input_files;
+ int output_level;
+
+  // db_id is used to generate the unique id of SSTs on the remote compactor
+ std::string db_id;
+
+ // information for subcompaction
+ bool has_begin = false;
+ std::string begin;
+ bool has_end = false;
+ std::string end;
+
+ // serialization interface to read and write the object
+ static Status Read(const std::string& data_str, CompactionServiceInput* obj);
+ Status Write(std::string* output);
+
+ // Initialize a dummy ColumnFamilyDescriptor
+ CompactionServiceInput() : column_family("", ColumnFamilyOptions()) {}
+
+#ifndef NDEBUG
+ bool TEST_Equals(CompactionServiceInput* other);
+ bool TEST_Equals(CompactionServiceInput* other, std::string* mismatch);
+#endif // NDEBUG
+};
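+
+// A hedged sketch of how the serialization interface above might be used
+// (variable names are illustrative only):
+//
+//   CompactionServiceInput input;
+//   // ... fill in column_family, db_options, input_files, output_level ...
+//   std::string serialized;
+//   Status s = input.Write(&serialized);
+//   // ship `serialized` to the remote compactor, then on the remote side:
+//   CompactionServiceInput remote_input;
+//   if (s.ok()) {
+//     s = CompactionServiceInput::Read(serialized, &remote_input);
+//   }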
+
+// CompactionServiceOutputFile is the metadata for the output SST file
+struct CompactionServiceOutputFile {
+ std::string file_name;
+ SequenceNumber smallest_seqno;
+ SequenceNumber largest_seqno;
+ std::string smallest_internal_key;
+ std::string largest_internal_key;
+ uint64_t oldest_ancester_time;
+ uint64_t file_creation_time;
+ uint64_t paranoid_hash;
+ bool marked_for_compaction;
+ UniqueId64x2 unique_id;
+
+ CompactionServiceOutputFile() = default;
+ CompactionServiceOutputFile(
+ const std::string& name, SequenceNumber smallest, SequenceNumber largest,
+ std::string _smallest_internal_key, std::string _largest_internal_key,
+ uint64_t _oldest_ancester_time, uint64_t _file_creation_time,
+ uint64_t _paranoid_hash, bool _marked_for_compaction,
+ UniqueId64x2 _unique_id)
+ : file_name(name),
+ smallest_seqno(smallest),
+ largest_seqno(largest),
+ smallest_internal_key(std::move(_smallest_internal_key)),
+ largest_internal_key(std::move(_largest_internal_key)),
+ oldest_ancester_time(_oldest_ancester_time),
+ file_creation_time(_file_creation_time),
+ paranoid_hash(_paranoid_hash),
+ marked_for_compaction(_marked_for_compaction),
+ unique_id(std::move(_unique_id)) {}
+};
+
+// CompactionServiceResult contains the compaction result from a different db
+// instance; with this information, the primary db instance with write
+// permission is able to install the result into the DB.
+struct CompactionServiceResult {
+ Status status;
+ std::vector<CompactionServiceOutputFile> output_files;
+ int output_level;
+
+ // location of the output files
+ std::string output_path;
+
+ // some statistics about the compaction
+ uint64_t num_output_records = 0;
+ uint64_t total_bytes = 0;
+ uint64_t bytes_read = 0;
+ uint64_t bytes_written = 0;
+ CompactionJobStats stats;
+
+ // serialization interface to read and write the object
+ static Status Read(const std::string& data_str, CompactionServiceResult* obj);
+ Status Write(std::string* output);
+
+#ifndef NDEBUG
+ bool TEST_Equals(CompactionServiceResult* other);
+ bool TEST_Equals(CompactionServiceResult* other, std::string* mismatch);
+#endif // NDEBUG
+};
+
+// CompactionServiceCompactionJob is a read-only compaction job; it takes
+// input information from `compaction_service_input`, puts result information
+// in `compaction_service_result`, and generates the SST files under
+// `output_path`.
+class CompactionServiceCompactionJob : private CompactionJob {
+ public:
+ CompactionServiceCompactionJob(
+ int job_id, Compaction* compaction, const ImmutableDBOptions& db_options,
+ const MutableDBOptions& mutable_db_options,
+ const FileOptions& file_options, VersionSet* versions,
+ const std::atomic<bool>* shutting_down, LogBuffer* log_buffer,
+ FSDirectory* output_directory, Statistics* stats,
+ InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler,
+ std::vector<SequenceNumber> existing_snapshots,
+ std::shared_ptr<Cache> table_cache, EventLogger* event_logger,
+ const std::string& dbname, const std::shared_ptr<IOTracer>& io_tracer,
+ const std::atomic<bool>& manual_compaction_canceled,
+ const std::string& db_id, const std::string& db_session_id,
+ std::string output_path,
+ const CompactionServiceInput& compaction_service_input,
+ CompactionServiceResult* compaction_service_result);
+
+  // Run the compaction in the current thread and return the result
+ Status Run();
+
+ void CleanupCompaction();
+
+ IOStatus io_status() const { return CompactionJob::io_status(); }
+
+ protected:
+ void RecordCompactionIOStats() override;
+
+ private:
+ // Get table file name in output_path
+ std::string GetTableFileName(uint64_t file_number) override;
+  // Specifies the compaction output path; otherwise, the default DB path is
+  // used
+ const std::string output_path_;
+
+ // Compaction job input
+ const CompactionServiceInput& compaction_input_;
+
+ // Compaction job result
+ CompactionServiceResult* compaction_result_;
+};
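+
+// A hedged usage sketch (illustrative only; the real driver is the compaction
+// service plumbing around DBImpl): a remote worker constructs the job with
+// the deserialized input and a result object to fill, runs it, and serializes
+// the result back to the primary instance:
+//
+//   CompactionServiceResult result;
+//   CompactionServiceCompactionJob job(/* ..., */ output_path,
+//                                      compaction_service_input, &result);
+//   Status s = job.Run();
+//   std::string serialized_result;
+//   if (s.ok()) {
+//     s = result.Write(&serialized_result);
+//   }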
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_job_stats_test.cc b/src/rocksdb/db/compaction/compaction_job_stats_test.cc
new file mode 100644
index 000000000..930270778
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_job_stats_test.cc
@@ -0,0 +1,975 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <algorithm>
+#include <cinttypes>
+#include <iostream>
+#include <mutex>
+#include <queue>
+#include <set>
+#include <thread>
+#include <unordered_set>
+#include <utility>
+
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "db/job_context.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "env/mock_env.h"
+#include "file/filename.h"
+#include "monitoring/statistics.h"
+#include "monitoring/thread_status_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/experimental.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/options.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "rocksdb/thread_status.h"
+#include "rocksdb/utilities/checkpoint.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/mock_table.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/scoped_arena_iterator.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/cast_util.h"
+#include "util/compression.h"
+#include "util/hash.h"
+#include "util/mutexlock.h"
+#include "util/rate_limiter.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+#if !defined(IOS_CROSS_COMPILE)
+#ifndef ROCKSDB_LITE
+namespace ROCKSDB_NAMESPACE {
+
+static std::string RandomString(Random* rnd, int len, double ratio) {
+ std::string r;
+ test::CompressibleString(rnd, ratio, len, &r);
+ return r;
+}
+
+std::string Key(uint64_t key, int length) {
+ const int kBufSize = 1000;
+ char buf[kBufSize];
+ if (length > kBufSize) {
+ length = kBufSize;
+ }
+ snprintf(buf, kBufSize, "%0*" PRIu64, length, key);
+ return std::string(buf);
+}
+
+class CompactionJobStatsTest : public testing::Test,
+ public testing::WithParamInterface<bool> {
+ public:
+ std::string dbname_;
+ std::string alternative_wal_dir_;
+ Env* env_;
+ DB* db_;
+ std::vector<ColumnFamilyHandle*> handles_;
+ uint32_t max_subcompactions_;
+
+ Options last_options_;
+
+ CompactionJobStatsTest() : env_(Env::Default()) {
+ env_->SetBackgroundThreads(1, Env::LOW);
+ env_->SetBackgroundThreads(1, Env::HIGH);
+ dbname_ = test::PerThreadDBPath("compaction_job_stats_test");
+ alternative_wal_dir_ = dbname_ + "/wal";
+ Options options;
+ options.create_if_missing = true;
+ max_subcompactions_ = GetParam();
+ options.max_subcompactions = max_subcompactions_;
+ auto delete_options = options;
+ delete_options.wal_dir = alternative_wal_dir_;
+ EXPECT_OK(DestroyDB(dbname_, delete_options));
+    // Destroy it again in case the alternative WAL dir was not used.
+ EXPECT_OK(DestroyDB(dbname_, options));
+ db_ = nullptr;
+ Reopen(options);
+ }
+
+ ~CompactionJobStatsTest() override {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ Close();
+ Options options;
+ options.db_paths.emplace_back(dbname_, 0);
+ options.db_paths.emplace_back(dbname_ + "_2", 0);
+ options.db_paths.emplace_back(dbname_ + "_3", 0);
+ options.db_paths.emplace_back(dbname_ + "_4", 0);
+ EXPECT_OK(DestroyDB(dbname_, options));
+ }
+
+ // Required if inheriting from testing::WithParamInterface<>
+ static void SetUpTestCase() {}
+ static void TearDownTestCase() {}
+
+ DBImpl* dbfull() { return static_cast_with_check<DBImpl>(db_); }
+
+ void CreateColumnFamilies(const std::vector<std::string>& cfs,
+ const Options& options) {
+ ColumnFamilyOptions cf_opts(options);
+ size_t cfi = handles_.size();
+ handles_.resize(cfi + cfs.size());
+ for (auto cf : cfs) {
+ ASSERT_OK(db_->CreateColumnFamily(cf_opts, cf, &handles_[cfi++]));
+ }
+ }
+
+ void CreateAndReopenWithCF(const std::vector<std::string>& cfs,
+ const Options& options) {
+ CreateColumnFamilies(cfs, options);
+ std::vector<std::string> cfs_plus_default = cfs;
+ cfs_plus_default.insert(cfs_plus_default.begin(), kDefaultColumnFamilyName);
+ ReopenWithColumnFamilies(cfs_plus_default, options);
+ }
+
+ void ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+ const std::vector<Options>& options) {
+ ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
+ }
+
+ void ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+ const Options& options) {
+ ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
+ }
+
+ Status TryReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+ const std::vector<Options>& options) {
+ Close();
+ EXPECT_EQ(cfs.size(), options.size());
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (size_t i = 0; i < cfs.size(); ++i) {
+ column_families.push_back(ColumnFamilyDescriptor(cfs[i], options[i]));
+ }
+ DBOptions db_opts = DBOptions(options[0]);
+ return DB::Open(db_opts, dbname_, column_families, &handles_, &db_);
+ }
+
+ Status TryReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+ const Options& options) {
+ Close();
+ std::vector<Options> v_opts(cfs.size(), options);
+ return TryReopenWithColumnFamilies(cfs, v_opts);
+ }
+
+ void Reopen(const Options& options) { ASSERT_OK(TryReopen(options)); }
+
+ void Close() {
+ for (auto h : handles_) {
+ delete h;
+ }
+ handles_.clear();
+ delete db_;
+ db_ = nullptr;
+ }
+
+ void DestroyAndReopen(const Options& options) {
+ // Destroy using last options
+ Destroy(last_options_);
+ ASSERT_OK(TryReopen(options));
+ }
+
+ void Destroy(const Options& options) {
+ Close();
+ ASSERT_OK(DestroyDB(dbname_, options));
+ }
+
+ Status ReadOnlyReopen(const Options& options) {
+ return DB::OpenForReadOnly(options, dbname_, &db_);
+ }
+
+ Status TryReopen(const Options& options) {
+ Close();
+ last_options_ = options;
+ return DB::Open(options, dbname_, &db_);
+ }
+
+ Status Flush(int cf = 0) {
+ if (cf == 0) {
+ return db_->Flush(FlushOptions());
+ } else {
+ return db_->Flush(FlushOptions(), handles_[cf]);
+ }
+ }
+
+ Status Put(const Slice& k, const Slice& v, WriteOptions wo = WriteOptions()) {
+ return db_->Put(wo, k, v);
+ }
+
+ Status Put(int cf, const Slice& k, const Slice& v,
+ WriteOptions wo = WriteOptions()) {
+ return db_->Put(wo, handles_[cf], k, v);
+ }
+
+ Status Delete(const std::string& k) { return db_->Delete(WriteOptions(), k); }
+
+ Status Delete(int cf, const std::string& k) {
+ return db_->Delete(WriteOptions(), handles_[cf], k);
+ }
+
+ std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) {
+ ReadOptions options;
+ options.verify_checksums = true;
+ options.snapshot = snapshot;
+ std::string result;
+ Status s = db_->Get(options, k, &result);
+ if (s.IsNotFound()) {
+ result = "NOT_FOUND";
+ } else if (!s.ok()) {
+ result = s.ToString();
+ }
+ return result;
+ }
+
+ std::string Get(int cf, const std::string& k,
+ const Snapshot* snapshot = nullptr) {
+ ReadOptions options;
+ options.verify_checksums = true;
+ options.snapshot = snapshot;
+ std::string result;
+ Status s = db_->Get(options, handles_[cf], k, &result);
+ if (s.IsNotFound()) {
+ result = "NOT_FOUND";
+ } else if (!s.ok()) {
+ result = s.ToString();
+ }
+ return result;
+ }
+
+ int NumTableFilesAtLevel(int level, int cf = 0) {
+ std::string property;
+ if (cf == 0) {
+ // default cfd
+ EXPECT_TRUE(db_->GetProperty(
+ "rocksdb.num-files-at-level" + std::to_string(level), &property));
+ } else {
+ EXPECT_TRUE(db_->GetProperty(
+ handles_[cf], "rocksdb.num-files-at-level" + std::to_string(level),
+ &property));
+ }
+ return atoi(property.c_str());
+ }
+
+ // Return spread of files per level
+ std::string FilesPerLevel(int cf = 0) {
+ int num_levels =
+ (cf == 0) ? db_->NumberLevels() : db_->NumberLevels(handles_[1]);
+ std::string result;
+ size_t last_non_zero_offset = 0;
+ for (int level = 0; level < num_levels; level++) {
+ int f = NumTableFilesAtLevel(level, cf);
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
+ result += buf;
+ if (f > 0) {
+ last_non_zero_offset = result.size();
+ }
+ }
+ result.resize(last_non_zero_offset);
+ return result;
+ }
+
+ Status Size(uint64_t* size, const Slice& start, const Slice& limit,
+ int cf = 0) {
+ Range r(start, limit);
+ if (cf == 0) {
+ return db_->GetApproximateSizes(&r, 1, size);
+ } else {
+ return db_->GetApproximateSizes(handles_[1], &r, 1, size);
+ }
+ }
+
+ void Compact(int cf, const Slice& start, const Slice& limit,
+ uint32_t target_path_id) {
+ CompactRangeOptions compact_options;
+ compact_options.target_path_id = target_path_id;
+ ASSERT_OK(db_->CompactRange(compact_options, handles_[cf], &start, &limit));
+ }
+
+ void Compact(int cf, const Slice& start, const Slice& limit) {
+ ASSERT_OK(
+ db_->CompactRange(CompactRangeOptions(), handles_[cf], &start, &limit));
+ }
+
+ void Compact(const Slice& start, const Slice& limit) {
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &limit));
+ }
+
+ void TEST_Compact(int level, int cf, const Slice& start, const Slice& limit) {
+ ASSERT_OK(dbfull()->TEST_CompactRange(level, &start, &limit, handles_[cf],
+ true /* disallow trivial move */));
+ }
+
+ // Do n memtable compactions, each of which produces an sstable
+ // covering the range [small,large].
+ void MakeTables(int n, const std::string& small, const std::string& large,
+ int cf = 0) {
+ for (int i = 0; i < n; i++) {
+ ASSERT_OK(Put(cf, small, "begin"));
+ ASSERT_OK(Put(cf, large, "end"));
+ ASSERT_OK(Flush(cf));
+ }
+ }
+
+ static void SetDeletionCompactionStats(CompactionJobStats* stats,
+ uint64_t input_deletions,
+ uint64_t expired_deletions,
+ uint64_t records_replaced) {
+ stats->num_input_deletion_records = input_deletions;
+ stats->num_expired_deletion_records = expired_deletions;
+ stats->num_records_replaced = records_replaced;
+ }
+
+ void MakeTableWithKeyValues(Random* rnd, uint64_t smallest, uint64_t largest,
+ int key_size, int value_size, uint64_t interval,
+ double ratio, int cf = 0) {
+ for (auto key = smallest; key < largest; key += interval) {
+ ASSERT_OK(Put(cf, Slice(Key(key, key_size)),
+ Slice(RandomString(rnd, value_size, ratio))));
+ }
+ ASSERT_OK(Flush(cf));
+ }
+
+ // This function behaves with the implicit understanding that two
+ // rounds of keys are inserted into the database, as per the behavior
+ // of the DeletionStatsTest.
+ void SelectivelyDeleteKeys(uint64_t smallest, uint64_t largest,
+ uint64_t interval, int deletion_interval,
+ int key_size, uint64_t cutoff_key_num,
+ CompactionJobStats* stats, int cf = 0) {
+ // interval needs to be >= 2 so that deletion entries can be inserted
+ // that are intended to not result in an actual key deletion by using
+ // an offset of 1 from another existing key
+ ASSERT_GE(interval, 2);
+
+ uint64_t ctr = 1;
+ uint32_t deletions_made = 0;
+ uint32_t num_deleted = 0;
+ uint32_t num_expired = 0;
+ for (auto key = smallest; key <= largest; key += interval, ctr++) {
+ if (ctr % deletion_interval == 0) {
+ ASSERT_OK(Delete(cf, Key(key, key_size)));
+ deletions_made++;
+ num_deleted++;
+
+ if (key > cutoff_key_num) {
+ num_expired++;
+ }
+ }
+ }
+
+    // Insert some deletions for keys that don't exist, both in and out of
+    // the key range
+ ASSERT_OK(Delete(cf, Key(smallest + 1, key_size)));
+ deletions_made++;
+
+ ASSERT_OK(Delete(cf, Key(smallest - 1, key_size)));
+ deletions_made++;
+ num_expired++;
+
+ ASSERT_OK(Delete(cf, Key(smallest - 9, key_size)));
+ deletions_made++;
+ num_expired++;
+
+ ASSERT_OK(Flush(cf));
+ SetDeletionCompactionStats(stats, deletions_made, num_expired, num_deleted);
+ }
+};
+
+// An EventListener which helps verify the compaction results in
+// test CompactionJobStatsTest.
+class CompactionJobStatsChecker : public EventListener {
+ public:
+ CompactionJobStatsChecker()
+ : compression_enabled_(false), verify_next_comp_io_stats_(false) {}
+
+ size_t NumberOfUnverifiedStats() { return expected_stats_.size(); }
+
+ void set_verify_next_comp_io_stats(bool v) { verify_next_comp_io_stats_ = v; }
+
+  // Once a compaction has completed, this function verifies the returned
+  // CompactionJobStats against the oldest expected stats added earlier to
+  // "expected_stats_" that have not yet been used for verification.
+ void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) override {
+ if (verify_next_comp_io_stats_) {
+ ASSERT_GT(ci.stats.file_write_nanos, 0);
+ ASSERT_GT(ci.stats.file_range_sync_nanos, 0);
+ ASSERT_GT(ci.stats.file_fsync_nanos, 0);
+ ASSERT_GT(ci.stats.file_prepare_write_nanos, 0);
+ verify_next_comp_io_stats_ = false;
+ }
+
+ std::lock_guard<std::mutex> lock(mutex_);
+ if (expected_stats_.size()) {
+ Verify(ci.stats, expected_stats_.front());
+ expected_stats_.pop();
+ }
+ }
+
+  // A helper function which verifies whether two CompactionJobStats
+  // match. The verification of all compaction stats is done by
+  // ASSERT_EQ except for the total input / output bytes, for which we
+  // use ASSERT_GE and ASSERT_LE with a reasonable bias ---
+  // 10% in the uncompressed case and 20% when compression is used.
+ virtual void Verify(const CompactionJobStats& current_stats,
+ const CompactionJobStats& stats) {
+ // time
+ ASSERT_GT(current_stats.elapsed_micros, 0U);
+
+ ASSERT_EQ(current_stats.num_input_records, stats.num_input_records);
+ ASSERT_EQ(current_stats.num_input_files, stats.num_input_files);
+ ASSERT_EQ(current_stats.num_input_files_at_output_level,
+ stats.num_input_files_at_output_level);
+
+ ASSERT_EQ(current_stats.num_output_records, stats.num_output_records);
+ ASSERT_EQ(current_stats.num_output_files, stats.num_output_files);
+
+ ASSERT_EQ(current_stats.is_full_compaction, stats.is_full_compaction);
+ ASSERT_EQ(current_stats.is_manual_compaction, stats.is_manual_compaction);
+
+ // file size
+ double kFileSizeBias = compression_enabled_ ? 0.20 : 0.10;
+ ASSERT_GE(current_stats.total_input_bytes * (1.00 + kFileSizeBias),
+ stats.total_input_bytes);
+ ASSERT_LE(current_stats.total_input_bytes,
+ stats.total_input_bytes * (1.00 + kFileSizeBias));
+ ASSERT_GE(current_stats.total_output_bytes * (1.00 + kFileSizeBias),
+ stats.total_output_bytes);
+ ASSERT_LE(current_stats.total_output_bytes,
+ stats.total_output_bytes * (1.00 + kFileSizeBias));
+ ASSERT_EQ(current_stats.total_input_raw_key_bytes,
+ stats.total_input_raw_key_bytes);
+ ASSERT_EQ(current_stats.total_input_raw_value_bytes,
+ stats.total_input_raw_value_bytes);
+
+ ASSERT_EQ(current_stats.num_records_replaced, stats.num_records_replaced);
+
+ ASSERT_EQ(current_stats.num_corrupt_keys, stats.num_corrupt_keys);
+
+ ASSERT_EQ(std::string(current_stats.smallest_output_key_prefix),
+ std::string(stats.smallest_output_key_prefix));
+ ASSERT_EQ(std::string(current_stats.largest_output_key_prefix),
+ std::string(stats.largest_output_key_prefix));
+ }
+
+  // Add expected compaction stats, which will be used to
+ // verify the CompactionJobStats returned by the OnCompactionCompleted()
+ // callback.
+ void AddExpectedStats(const CompactionJobStats& stats) {
+ std::lock_guard<std::mutex> lock(mutex_);
+ expected_stats_.push(stats);
+ }
+
+ void EnableCompression(bool flag) { compression_enabled_ = flag; }
+
+ bool verify_next_comp_io_stats() const { return verify_next_comp_io_stats_; }
+
+ private:
+ std::mutex mutex_;
+ std::queue<CompactionJobStats> expected_stats_;
+ bool compression_enabled_;
+ bool verify_next_comp_io_stats_;
+};
+
+// An EventListener which helps verify the compaction statistics in
+// the test DeletionStatsTest.
+class CompactionJobDeletionStatsChecker : public CompactionJobStatsChecker {
+ public:
+ // Verifies whether two CompactionJobStats match.
+ void Verify(const CompactionJobStats& current_stats,
+ const CompactionJobStats& stats) override {
+ ASSERT_EQ(current_stats.num_input_deletion_records,
+ stats.num_input_deletion_records);
+ ASSERT_EQ(current_stats.num_expired_deletion_records,
+ stats.num_expired_deletion_records);
+ ASSERT_EQ(current_stats.num_records_replaced, stats.num_records_replaced);
+
+ ASSERT_EQ(current_stats.num_corrupt_keys, stats.num_corrupt_keys);
+ }
+};
+
+namespace {
+
+uint64_t EstimatedFileSize(uint64_t num_records, size_t key_size,
+ size_t value_size, double compression_ratio = 1.0,
+ size_t block_size = 4096,
+ int bloom_bits_per_key = 10) {
+ const size_t kPerKeyOverhead = 8;
+ const size_t kFooterSize = 512;
+
+ uint64_t data_size = static_cast<uint64_t>(
+ num_records *
+ (key_size + value_size * compression_ratio + kPerKeyOverhead));
+
+ return data_size + kFooterSize +
+ num_records * bloom_bits_per_key / 8 // filter block
+ + data_size * (key_size + 8) / block_size; // index block
+}
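+
+// A quick worked example of the formula above (illustrative arithmetic only):
+// with num_records = 100, key_size = 10, value_size = 1000 and the default
+// arguments, data_size = 100 * (10 + 1000 + 8) = 101800, so the estimate is
+// 101800 + 512 + 100 * 10 / 8 + 101800 * 18 / 4096
+// = 101800 + 512 + 125 + 447 = 102884 bytes.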
+
+namespace {
+
+void CopyPrefix(const Slice& src, size_t prefix_length, std::string* dst) {
+ assert(prefix_length > 0);
+ size_t length = src.size() > prefix_length ? prefix_length : src.size();
+ dst->assign(src.data(), length);
+}
+
+} // namespace
+
+CompactionJobStats NewManualCompactionJobStats(
+ const std::string& smallest_key, const std::string& largest_key,
+ size_t num_input_files, size_t num_input_files_at_output_level,
+ uint64_t num_input_records, size_t key_size, size_t value_size,
+ size_t num_output_files, uint64_t num_output_records,
+ double compression_ratio, uint64_t num_records_replaced,
+ bool is_full = false, bool is_manual = true) {
+ CompactionJobStats stats;
+ stats.Reset();
+
+ stats.num_input_records = num_input_records;
+ stats.num_input_files = num_input_files;
+ stats.num_input_files_at_output_level = num_input_files_at_output_level;
+
+ stats.num_output_records = num_output_records;
+ stats.num_output_files = num_output_files;
+
+ stats.total_input_bytes =
+ EstimatedFileSize(num_input_records / num_input_files, key_size,
+ value_size, compression_ratio) *
+ num_input_files;
+ stats.total_output_bytes =
+ EstimatedFileSize(num_output_records / num_output_files, key_size,
+ value_size, compression_ratio) *
+ num_output_files;
+ stats.total_input_raw_key_bytes = num_input_records * (key_size + 8);
+ stats.total_input_raw_value_bytes = num_input_records * value_size;
+
+ stats.is_full_compaction = is_full;
+ stats.is_manual_compaction = is_manual;
+
+ stats.num_records_replaced = num_records_replaced;
+
+ CopyPrefix(smallest_key, CompactionJobStats::kMaxPrefixLength,
+ &stats.smallest_output_key_prefix);
+ CopyPrefix(largest_key, CompactionJobStats::kMaxPrefixLength,
+ &stats.largest_output_key_prefix);
+
+ return stats;
+}
+
+CompressionType GetAnyCompression() {
+ if (Snappy_Supported()) {
+ return kSnappyCompression;
+ } else if (Zlib_Supported()) {
+ return kZlibCompression;
+ } else if (BZip2_Supported()) {
+ return kBZip2Compression;
+ } else if (LZ4_Supported()) {
+ return kLZ4Compression;
+ } else if (XPRESS_Supported()) {
+ return kXpressCompression;
+ }
+
+ return kNoCompression;
+}
+
+} // namespace
+
+TEST_P(CompactionJobStatsTest, CompactionJobStatsTest) {
+ Random rnd(301);
+ const int kBufSize = 100;
+ char buf[kBufSize];
+ uint64_t key_base = 100000000l;
+  // Note: key_base must be a multiple of num_keys_per_L0_file
+ int num_keys_per_L0_file = 100;
+ const int kTestScale = 8;
+ const int kKeySize = 10;
+ const int kValueSize = 1000;
+ const double kCompressionRatio = 0.5;
+ double compression_ratio = 1.0;
+ uint64_t key_interval = key_base / num_keys_per_L0_file;
+
+ // Whenever a compaction completes, this listener will try to
+ // verify whether the returned CompactionJobStats matches
+ // what we expect. The expected CompactionJobStats is added
+ // via AddExpectedStats().
+ auto* stats_checker = new CompactionJobStatsChecker();
+ Options options;
+ options.listeners.emplace_back(stats_checker);
+ options.create_if_missing = true;
+  // Just enough settings to hold off auto-compaction.
+ options.level0_file_num_compaction_trigger = kTestScale + 1;
+ options.num_levels = 3;
+ options.compression = kNoCompression;
+ options.max_subcompactions = max_subcompactions_;
+ options.bytes_per_sync = 512 * 1024;
+
+ options.report_bg_io_stats = true;
+ for (int test = 0; test < 2; ++test) {
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // 1st Phase: generate "num_L0_files" L0 files.
+ int num_L0_files = 0;
+ for (uint64_t start_key = key_base; start_key <= key_base * kTestScale;
+ start_key += key_base) {
+ MakeTableWithKeyValues(&rnd, start_key, start_key + key_base - 1,
+ kKeySize, kValueSize, key_interval,
+ compression_ratio, 1);
+ snprintf(buf, kBufSize, "%d", ++num_L0_files);
+ ASSERT_EQ(std::string(buf), FilesPerLevel(1));
+ }
+ ASSERT_EQ(std::to_string(num_L0_files), FilesPerLevel(1));
+
+ // 2nd Phase: perform L0 -> L1 compaction.
+ int L0_compaction_count = 6;
+ int count = 1;
+ std::string smallest_key;
+ std::string largest_key;
+ for (uint64_t start_key = key_base;
+ start_key <= key_base * L0_compaction_count;
+ start_key += key_base, count++) {
+ smallest_key = Key(start_key, 10);
+ largest_key = Key(start_key + key_base - key_interval, 10);
+ stats_checker->AddExpectedStats(NewManualCompactionJobStats(
+ smallest_key, largest_key, 1, 0, num_keys_per_L0_file, kKeySize,
+ kValueSize, 1, num_keys_per_L0_file, compression_ratio, 0));
+ ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 1U);
+ TEST_Compact(0, 1, smallest_key, largest_key);
+ snprintf(buf, kBufSize, "%d,%d", num_L0_files - count, count);
+ ASSERT_EQ(std::string(buf), FilesPerLevel(1));
+ }
+
+ // compact two files into one in the last L0 -> L1 compaction
+ int num_remaining_L0 = num_L0_files - L0_compaction_count;
+ smallest_key = Key(key_base * (L0_compaction_count + 1), 10);
+ largest_key = Key(key_base * (kTestScale + 1) - key_interval, 10);
+ stats_checker->AddExpectedStats(NewManualCompactionJobStats(
+ smallest_key, largest_key, num_remaining_L0, 0,
+ num_keys_per_L0_file * num_remaining_L0, kKeySize, kValueSize, 1,
+ num_keys_per_L0_file * num_remaining_L0, compression_ratio, 0));
+ ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 1U);
+ TEST_Compact(0, 1, smallest_key, largest_key);
+
+ int num_L1_files = num_L0_files - num_remaining_L0 + 1;
+ num_L0_files = 0;
+ snprintf(buf, kBufSize, "%d,%d", num_L0_files, num_L1_files);
+ ASSERT_EQ(std::string(buf), FilesPerLevel(1));
+
+ // 3rd Phase: generate sparse L0 files (wider key-range, same num of keys)
+ int sparseness = 2;
+ for (uint64_t start_key = key_base; start_key <= key_base * kTestScale;
+ start_key += key_base * sparseness) {
+ MakeTableWithKeyValues(
+ &rnd, start_key, start_key + key_base * sparseness - 1, kKeySize,
+ kValueSize, key_base * sparseness / num_keys_per_L0_file,
+ compression_ratio, 1);
+ snprintf(buf, kBufSize, "%d,%d", ++num_L0_files, num_L1_files);
+ ASSERT_EQ(std::string(buf), FilesPerLevel(1));
+ }
+
+    // 4th Phase: perform L0 -> L1 compaction again, expect higher write amp.
+    // When subcompactions are enabled, the number of output files increases
+    // by 1 because multiple threads are consuming the input and generating
+    // output files without coordinating to see if the output could fit into
+    // a smaller number of files like it does when it runs sequentially.
+ int num_output_files = options.max_subcompactions > 1 ? 2 : 1;
+ for (uint64_t start_key = key_base; num_L0_files > 1;
+ start_key += key_base * sparseness) {
+ smallest_key = Key(start_key, 10);
+ largest_key = Key(start_key + key_base * sparseness - key_interval, 10);
+ stats_checker->AddExpectedStats(NewManualCompactionJobStats(
+ smallest_key, largest_key, 3, 2, num_keys_per_L0_file * 3, kKeySize,
+ kValueSize, num_output_files,
+ num_keys_per_L0_file * 2, // 1/3 of the data will be updated.
+ compression_ratio, num_keys_per_L0_file));
+ ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 1U);
+ Compact(1, smallest_key, largest_key);
+ if (options.max_subcompactions == 1) {
+ --num_L1_files;
+ }
+ snprintf(buf, kBufSize, "%d,%d", --num_L0_files, num_L1_files);
+ ASSERT_EQ(std::string(buf), FilesPerLevel(1));
+ }
+
+    // 5th Phase: Do a full compaction, which involves two sub-compactions.
+    // Here we expect to have 1 L0 file and 4 L1 files.
+    // In the first sub-compaction, we expect an L0 compaction.
+ smallest_key = Key(key_base, 10);
+ largest_key = Key(key_base * (kTestScale + 1) - key_interval, 10);
+ stats_checker->AddExpectedStats(NewManualCompactionJobStats(
+ Key(key_base * (kTestScale + 1 - sparseness), 10), largest_key, 2, 1,
+ num_keys_per_L0_file * 3, kKeySize, kValueSize, 1,
+ num_keys_per_L0_file * 2, compression_ratio, num_keys_per_L0_file));
+ ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 1U);
+ Compact(1, smallest_key, largest_key);
+
+ num_L1_files = options.max_subcompactions > 1 ? 7 : 4;
+ char L1_buf[4];
+ snprintf(L1_buf, sizeof(L1_buf), "0,%d", num_L1_files);
+ std::string L1_files(L1_buf);
+ ASSERT_EQ(L1_files, FilesPerLevel(1));
+ options.compression = GetAnyCompression();
+ if (options.compression == kNoCompression) {
+ break;
+ }
+ stats_checker->EnableCompression(true);
+ compression_ratio = kCompressionRatio;
+
+ for (int i = 0; i < 5; i++) {
+ ASSERT_OK(Put(1, Slice(Key(key_base + i, 10)),
+ Slice(RandomString(&rnd, 512 * 1024, 1))));
+ }
+
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_WaitForCompact());
+
+ stats_checker->set_verify_next_comp_io_stats(true);
+ std::atomic<bool> first_prepare_write(true);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WritableFileWriter::Append:BeforePrepareWrite", [&](void* /*arg*/) {
+ if (first_prepare_write.load()) {
+ options.env->SleepForMicroseconds(3);
+ first_prepare_write.store(false);
+ }
+ });
+
+ std::atomic<bool> first_flush(true);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WritableFileWriter::Flush:BeforeAppend", [&](void* /*arg*/) {
+ if (first_flush.load()) {
+ options.env->SleepForMicroseconds(3);
+ first_flush.store(false);
+ }
+ });
+
+ std::atomic<bool> first_sync(true);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WritableFileWriter::SyncInternal:0", [&](void* /*arg*/) {
+ if (first_sync.load()) {
+ options.env->SleepForMicroseconds(3);
+ first_sync.store(false);
+ }
+ });
+
+ std::atomic<bool> first_range_sync(true);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WritableFileWriter::RangeSync:0", [&](void* /*arg*/) {
+ if (first_range_sync.load()) {
+ options.env->SleepForMicroseconds(3);
+ first_range_sync.store(false);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Compact(1, smallest_key, largest_key);
+
+ ASSERT_TRUE(!stats_checker->verify_next_comp_io_stats());
+ ASSERT_TRUE(!first_prepare_write.load());
+ ASSERT_TRUE(!first_flush.load());
+ ASSERT_TRUE(!first_sync.load());
+ ASSERT_TRUE(!first_range_sync.load());
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+ ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 0U);
+}
+
+TEST_P(CompactionJobStatsTest, DeletionStatsTest) {
+ Random rnd(301);
+ uint64_t key_base = 100000l;
+  // Note: key_base must be a multiple of num_keys_per_L0_file
+ int num_keys_per_L0_file = 20;
+ const int kTestScale = 8; // make sure this is even
+ const int kKeySize = 10;
+ const int kValueSize = 100;
+ double compression_ratio = 1.0;
+ uint64_t key_interval = key_base / num_keys_per_L0_file;
+ uint64_t largest_key_num = key_base * (kTestScale + 1) - key_interval;
+ uint64_t cutoff_key_num = key_base * (kTestScale / 2 + 1) - key_interval;
+ const std::string smallest_key = Key(key_base - 10, kKeySize);
+ const std::string largest_key = Key(largest_key_num + 10, kKeySize);
+
+ // Whenever a compaction completes, this listener will try to
+ // verify whether the returned CompactionJobStats matches
+ // what we expect.
+ auto* stats_checker = new CompactionJobDeletionStatsChecker();
+ Options options;
+ options.listeners.emplace_back(stats_checker);
+ options.create_if_missing = true;
+ options.level0_file_num_compaction_trigger = kTestScale + 1;
+ options.num_levels = 3;
+ options.compression = kNoCompression;
+ options.max_bytes_for_level_multiplier = 2;
+ options.max_subcompactions = max_subcompactions_;
+
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Stage 1: Generate several L0 files and then send them to L2 by
+ // using CompactRangeOptions and CompactRange(). These files will
+ // have a strict subset of the keys from the full key-range
+ for (uint64_t start_key = key_base; start_key <= key_base * kTestScale / 2;
+ start_key += key_base) {
+ MakeTableWithKeyValues(&rnd, start_key, start_key + key_base - 1, kKeySize,
+ kValueSize, key_interval, compression_ratio, 1);
+ }
+
+ CompactRangeOptions cr_options;
+ cr_options.change_level = true;
+ cr_options.target_level = 2;
+ ASSERT_OK(db_->CompactRange(cr_options, handles_[1], nullptr, nullptr));
+ ASSERT_GT(NumTableFilesAtLevel(2, 1), 0);
+
+ // Stage 2: Generate files including keys from the entire key range
+ for (uint64_t start_key = key_base; start_key <= key_base * kTestScale;
+ start_key += key_base) {
+ MakeTableWithKeyValues(&rnd, start_key, start_key + key_base - 1, kKeySize,
+ kValueSize, key_interval, compression_ratio, 1);
+ }
+
+ // Send these L0 files to L1
+ TEST_Compact(0, 1, smallest_key, largest_key);
+ ASSERT_GT(NumTableFilesAtLevel(1, 1), 0);
+
+ // Add a new record and flush so now there is a L0 file
+ // with a value too (not just deletions from the next step)
+ ASSERT_OK(Put(1, Key(key_base - 6, kKeySize), "test"));
+ ASSERT_OK(Flush(1));
+
+ // Stage 3: Generate L0 files with some deletions so now
+ // there are files with the same key range in L0, L1, and L2
+ int deletion_interval = 3;
+ CompactionJobStats first_compaction_stats;
+ SelectivelyDeleteKeys(key_base, largest_key_num, key_interval,
+ deletion_interval, kKeySize, cutoff_key_num,
+ &first_compaction_stats, 1);
+
+ stats_checker->AddExpectedStats(first_compaction_stats);
+
+ // Stage 4: Trigger compaction and verify the stats
+ TEST_Compact(0, 1, smallest_key, largest_key);
+}
+
+namespace {
+int GetUniversalCompactionInputUnits(uint32_t num_flushes) {
+ uint32_t compaction_input_units;
+ for (compaction_input_units = 1; num_flushes >= compaction_input_units;
+ compaction_input_units *= 2) {
+ if ((num_flushes & compaction_input_units) != 0) {
+ return compaction_input_units > 1 ? compaction_input_units : 0;
+ }
+ }
+ return 0;
+}
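+
+// In other words, the helper above returns the lowest set bit of num_flushes
+// when that bit is greater than 1, and 0 otherwise. For example:
+// num_flushes = 6 (0b110) -> 2, num_flushes = 4 (0b100) -> 4,
+// num_flushes = 3 (0b011) -> 0, num_flushes = 5 (0b101) -> 0.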
+} // namespace
+
+TEST_P(CompactionJobStatsTest, UniversalCompactionTest) {
+ Random rnd(301);
+ uint64_t key_base = 100000000l;
+  // Note: key_base must be a multiple of num_keys_per_table
+ int num_keys_per_table = 100;
+ const uint32_t kTestScale = 6;
+ const int kKeySize = 10;
+ const int kValueSize = 900;
+ double compression_ratio = 1.0;
+ uint64_t key_interval = key_base / num_keys_per_table;
+
+ auto* stats_checker = new CompactionJobStatsChecker();
+ Options options;
+ options.listeners.emplace_back(stats_checker);
+ options.create_if_missing = true;
+ options.num_levels = 3;
+ options.compression = kNoCompression;
+ options.level0_file_num_compaction_trigger = 2;
+ options.target_file_size_base = num_keys_per_table * 1000;
+ options.compaction_style = kCompactionStyleUniversal;
+ options.compaction_options_universal.size_ratio = 1;
+ options.compaction_options_universal.max_size_amplification_percent = 1000;
+ options.max_subcompactions = max_subcompactions_;
+
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Generates the expected CompactionJobStats for each compaction
+ for (uint32_t num_flushes = 2; num_flushes <= kTestScale; num_flushes++) {
+    // Here we treat one newly flushed file as a unit.
+ //
+ // For example, if a newly flushed file is 100k, and a compaction has
+ // 4 input units, then this compaction inputs 400k.
+ uint32_t num_input_units = GetUniversalCompactionInputUnits(num_flushes);
+ if (num_input_units == 0) {
+ continue;
+ }
+    // A full compaction only happens when the number of flushes equals
+    // the number of compaction input runs.
+ bool is_full = num_flushes == num_input_units;
+ // The following statement determines the expected smallest key
+ // based on whether it is a full compaction.
+ uint64_t smallest_key = is_full ? key_base : key_base * (num_flushes - 1);
+
+ stats_checker->AddExpectedStats(NewManualCompactionJobStats(
+ Key(smallest_key, 10),
+ Key(smallest_key + key_base * num_input_units - key_interval, 10),
+ num_input_units, num_input_units > 2 ? num_input_units / 2 : 0,
+ num_keys_per_table * num_input_units, kKeySize, kValueSize,
+ num_input_units, num_keys_per_table * num_input_units, 1.0, 0, is_full,
+ false));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 3U);
+
+ for (uint64_t start_key = key_base; start_key <= key_base * kTestScale;
+ start_key += key_base) {
+ MakeTableWithKeyValues(&rnd, start_key, start_key + key_base - 1, kKeySize,
+ kValueSize, key_interval, compression_ratio, 1);
+ ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_WaitForCompact());
+ }
+ ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 0U);
+}
+
+INSTANTIATE_TEST_CASE_P(CompactionJobStatsTest, CompactionJobStatsTest,
+ ::testing::Values(1, 4));
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr, "SKIPPED, not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
+
+#else
+
+int main(int /*argc*/, char** /*argv*/) { return 0; }
+#endif // !defined(IOS_CROSS_COMPILE)
diff --git a/src/rocksdb/db/compaction/compaction_job_test.cc b/src/rocksdb/db/compaction/compaction_job_test.cc
new file mode 100644
index 000000000..c87871100
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_job_test.cc
@@ -0,0 +1,2451 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "db/compaction/compaction_job.h"
+
+#include <algorithm>
+#include <array>
+#include <cinttypes>
+#include <map>
+#include <string>
+#include <tuple>
+
+#include "db/blob/blob_index.h"
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/error_handler.h"
+#include "db/version_set.h"
+#include "file/random_access_file_reader.h"
+#include "file/writable_file_writer.h"
+#include "options/options_helper.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/options.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/mock_table.h"
+#include "table/unique_id_impl.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+void VerifyInitializationOfCompactionJobStats(
+ const CompactionJobStats& compaction_job_stats) {
+#if !defined(IOS_CROSS_COMPILE)
+ ASSERT_EQ(compaction_job_stats.elapsed_micros, 0U);
+
+ ASSERT_EQ(compaction_job_stats.num_input_records, 0U);
+ ASSERT_EQ(compaction_job_stats.num_input_files, 0U);
+ ASSERT_EQ(compaction_job_stats.num_input_files_at_output_level, 0U);
+
+ ASSERT_EQ(compaction_job_stats.num_output_records, 0U);
+ ASSERT_EQ(compaction_job_stats.num_output_files, 0U);
+
+ ASSERT_EQ(compaction_job_stats.is_manual_compaction, true);
+
+ ASSERT_EQ(compaction_job_stats.total_input_bytes, 0U);
+ ASSERT_EQ(compaction_job_stats.total_output_bytes, 0U);
+
+ ASSERT_EQ(compaction_job_stats.total_input_raw_key_bytes, 0U);
+ ASSERT_EQ(compaction_job_stats.total_input_raw_value_bytes, 0U);
+
+ ASSERT_EQ(compaction_job_stats.smallest_output_key_prefix[0], 0);
+ ASSERT_EQ(compaction_job_stats.largest_output_key_prefix[0], 0);
+
+ ASSERT_EQ(compaction_job_stats.num_records_replaced, 0U);
+
+ ASSERT_EQ(compaction_job_stats.num_input_deletion_records, 0U);
+ ASSERT_EQ(compaction_job_stats.num_expired_deletion_records, 0U);
+
+ ASSERT_EQ(compaction_job_stats.num_corrupt_keys, 0U);
+#endif // !defined(IOS_CROSS_COMPILE)
+}
+
+// Mock FSWritableFile for testing io priority.
+// Only override the essential functions for testing compaction io priority.
+class MockTestWritableFile : public FSWritableFileOwnerWrapper {
+ public:
+ MockTestWritableFile(std::unique_ptr<FSWritableFile>&& file,
+ Env::IOPriority io_priority)
+ : FSWritableFileOwnerWrapper(std::move(file)),
+ write_io_priority_(io_priority) {}
+ IOStatus Append(const Slice& data, const IOOptions& options,
+ IODebugContext* dbg) override {
+ EXPECT_EQ(options.rate_limiter_priority, write_io_priority_);
+ return target()->Append(data, options, dbg);
+ }
+ IOStatus Append(const Slice& data, const IOOptions& options,
+ const DataVerificationInfo& verification_info,
+ IODebugContext* dbg) override {
+ EXPECT_EQ(options.rate_limiter_priority, write_io_priority_);
+ return target()->Append(data, options, verification_info, dbg);
+ }
+ IOStatus Close(const IOOptions& options, IODebugContext* dbg) override {
+ EXPECT_EQ(options.rate_limiter_priority, write_io_priority_);
+ return target()->Close(options, dbg);
+ }
+ IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override {
+ EXPECT_EQ(options.rate_limiter_priority, write_io_priority_);
+ return target()->Flush(options, dbg);
+ }
+ IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override {
+ EXPECT_EQ(options.rate_limiter_priority, write_io_priority_);
+ return target()->Sync(options, dbg);
+ }
+ IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override {
+ EXPECT_EQ(options.rate_limiter_priority, write_io_priority_);
+ return target()->Fsync(options, dbg);
+ }
+ uint64_t GetFileSize(const IOOptions& options, IODebugContext* dbg) override {
+ EXPECT_EQ(options.rate_limiter_priority, write_io_priority_);
+ return target()->GetFileSize(options, dbg);
+ }
+ IOStatus RangeSync(uint64_t offset, uint64_t nbytes, const IOOptions& options,
+ IODebugContext* dbg) override {
+ EXPECT_EQ(options.rate_limiter_priority, write_io_priority_);
+ return target()->RangeSync(offset, nbytes, options, dbg);
+ }
+
+ void PrepareWrite(size_t offset, size_t len, const IOOptions& options,
+ IODebugContext* dbg) override {
+ EXPECT_EQ(options.rate_limiter_priority, write_io_priority_);
+ target()->PrepareWrite(offset, len, options, dbg);
+ }
+
+ IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& options,
+ IODebugContext* dbg) override {
+ EXPECT_EQ(options.rate_limiter_priority, write_io_priority_);
+ return target()->Allocate(offset, len, options, dbg);
+ }
+
+ private:
+ Env::IOPriority write_io_priority_;
+};
+
+// Mock FSRandomAccessFile for testing io priority.
+// Only override the essential functions for testing compaction io priority.
+class MockTestRandomAccessFile : public FSRandomAccessFileOwnerWrapper {
+ public:
+ MockTestRandomAccessFile(std::unique_ptr<FSRandomAccessFile>&& file,
+ Env::IOPriority io_priority)
+ : FSRandomAccessFileOwnerWrapper(std::move(file)),
+ read_io_priority_(io_priority) {}
+
+ IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+ Slice* result, char* scratch,
+ IODebugContext* dbg) const override {
+ EXPECT_EQ(options.rate_limiter_priority, read_io_priority_);
+ return target()->Read(offset, n, options, result, scratch, dbg);
+ }
+ IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& options,
+ IODebugContext* dbg) override {
+ EXPECT_EQ(options.rate_limiter_priority, read_io_priority_);
+ return target()->Prefetch(offset, n, options, dbg);
+ }
+
+ private:
+ Env::IOPriority read_io_priority_;
+};
+
+// Mock FileSystem for testing io priority.
+class MockTestFileSystem : public FileSystemWrapper {
+ public:
+ explicit MockTestFileSystem(const std::shared_ptr<FileSystem>& base,
+ Env::IOPriority read_io_priority,
+ Env::IOPriority write_io_priority)
+ : FileSystemWrapper(base),
+ read_io_priority_(read_io_priority),
+ write_io_priority_(write_io_priority) {}
+
+ static const char* kClassName() { return "MockTestFileSystem"; }
+ const char* Name() const override { return kClassName(); }
+
+ IOStatus NewRandomAccessFile(const std::string& fname,
+ const FileOptions& file_opts,
+ std::unique_ptr<FSRandomAccessFile>* result,
+ IODebugContext* dbg) override {
+ IOStatus s = target()->NewRandomAccessFile(fname, file_opts, result, dbg);
+ EXPECT_OK(s);
+ result->reset(
+ new MockTestRandomAccessFile(std::move(*result), read_io_priority_));
+ return s;
+ }
+ IOStatus NewWritableFile(const std::string& fname,
+ const FileOptions& file_opts,
+ std::unique_ptr<FSWritableFile>* result,
+ IODebugContext* dbg) override {
+ IOStatus s = target()->NewWritableFile(fname, file_opts, result, dbg);
+ EXPECT_OK(s);
+ result->reset(
+ new MockTestWritableFile(std::move(*result), write_io_priority_));
+ return s;
+ }
+
+ private:
+ Env::IOPriority read_io_priority_;
+ Env::IOPriority write_io_priority_;
+};
+
+enum TableTypeForTest : uint8_t { kMockTable = 0, kBlockBasedTable = 1 };
+
+} // namespace
+
+class CompactionJobTestBase : public testing::Test {
+ protected:
+ CompactionJobTestBase(std::string dbname, const Comparator* ucmp,
+ std::function<std::string(uint64_t)> encode_u64_ts,
+ bool test_io_priority, TableTypeForTest table_type)
+ : dbname_(std::move(dbname)),
+ ucmp_(ucmp),
+ db_options_(),
+ mutable_cf_options_(cf_options_),
+ mutable_db_options_(),
+ table_cache_(NewLRUCache(50000, 16)),
+ write_buffer_manager_(db_options_.db_write_buffer_size),
+ versions_(new VersionSet(
+ dbname_, &db_options_, env_options_, table_cache_.get(),
+ &write_buffer_manager_, &write_controller_,
+ /*block_cache_tracer=*/nullptr,
+ /*io_tracer=*/nullptr, /*db_id*/ "", /*db_session_id*/ "")),
+ shutting_down_(false),
+ mock_table_factory_(new mock::MockTableFactory()),
+ error_handler_(nullptr, db_options_, &mutex_),
+ encode_u64_ts_(std::move(encode_u64_ts)),
+ test_io_priority_(test_io_priority),
+ table_type_(table_type) {
+ Env* base_env = Env::Default();
+ EXPECT_OK(
+ test::CreateEnvFromSystem(ConfigOptions(), &base_env, &env_guard_));
+ env_ = base_env;
+ fs_ = env_->GetFileSystem();
+ // set default for the tests
+ mutable_cf_options_.target_file_size_base = 1024 * 1024;
+ mutable_cf_options_.max_compaction_bytes = 10 * 1024 * 1024;
+ }
+
+ void SetUp() override {
+ EXPECT_OK(env_->CreateDirIfMissing(dbname_));
+ db_options_.env = env_;
+ db_options_.fs = fs_;
+ db_options_.db_paths.emplace_back(dbname_,
+ std::numeric_limits<uint64_t>::max());
+ cf_options_.comparator = ucmp_;
+ if (table_type_ == TableTypeForTest::kBlockBasedTable) {
+ BlockBasedTableOptions table_options;
+ cf_options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ } else if (table_type_ == TableTypeForTest::kMockTable) {
+ cf_options_.table_factory = mock_table_factory_;
+ } else {
+ assert(false);
+ }
+ }
+
+ std::string GenerateFileName(uint64_t file_number) {
+ FileMetaData meta;
+ std::vector<DbPath> db_paths;
+ db_paths.emplace_back(dbname_, std::numeric_limits<uint64_t>::max());
+ meta.fd = FileDescriptor(file_number, 0, 0);
+ return TableFileName(db_paths, meta.fd.GetNumber(), meta.fd.GetPathId());
+ }
+
+ std::string KeyStr(const std::string& user_key, const SequenceNumber seq_num,
+ const ValueType t, uint64_t ts = 0) {
+ std::string user_key_with_ts = user_key + encode_u64_ts_(ts);
+ return InternalKey(user_key_with_ts, seq_num, t).Encode().ToString();
+ }
+
+ static std::string BlobStr(uint64_t blob_file_number, uint64_t offset,
+ uint64_t size) {
+ std::string blob_index;
+ BlobIndex::EncodeBlob(&blob_index, blob_file_number, offset, size,
+ kNoCompression);
+ return blob_index;
+ }
+
+ static std::string BlobStrTTL(uint64_t blob_file_number, uint64_t offset,
+ uint64_t size, uint64_t expiration) {
+ std::string blob_index;
+ BlobIndex::EncodeBlobTTL(&blob_index, expiration, blob_file_number, offset,
+ size, kNoCompression);
+ return blob_index;
+ }
+
+ static std::string BlobStrInlinedTTL(const Slice& value,
+ uint64_t expiration) {
+ std::string blob_index;
+ BlobIndex::EncodeInlinedTTL(&blob_index, expiration, value);
+ return blob_index;
+ }
+
+  // Creates a table with the specified key-value pairs.
+ void CreateTable(const std::string& table_name,
+ const mock::KVVector& contents, uint64_t& file_size) {
+ std::unique_ptr<WritableFileWriter> file_writer;
+ Status s = WritableFileWriter::Create(fs_, table_name, FileOptions(),
+ &file_writer, nullptr);
+ ASSERT_OK(s);
+ std::unique_ptr<TableBuilder> table_builder(
+ cf_options_.table_factory->NewTableBuilder(
+ TableBuilderOptions(*cfd_->ioptions(), mutable_cf_options_,
+ cfd_->internal_comparator(),
+ cfd_->int_tbl_prop_collector_factories(),
+ CompressionType::kNoCompression,
+ CompressionOptions(), 0 /* column_family_id */,
+ kDefaultColumnFamilyName, -1 /* level */),
+ file_writer.get()));
+ // Build table.
+ for (auto kv : contents) {
+ std::string key;
+ std::string value;
+ std::tie(key, value) = kv;
+ table_builder->Add(key, value);
+ }
+ ASSERT_OK(table_builder->Finish());
+ file_size = table_builder->FileSize();
+ }
+
+ void AddMockFile(const mock::KVVector& contents, int level = 0) {
+ assert(contents.size() > 0);
+
+ bool first_key = true;
+ std::string smallest, largest;
+ InternalKey smallest_key, largest_key;
+ SequenceNumber smallest_seqno = kMaxSequenceNumber;
+ SequenceNumber largest_seqno = 0;
+ uint64_t oldest_blob_file_number = kInvalidBlobFileNumber;
+ for (auto kv : contents) {
+ ParsedInternalKey key;
+ std::string skey;
+ std::string value;
+ std::tie(skey, value) = kv;
+ const Status pik_status =
+ ParseInternalKey(skey, &key, true /* log_err_key */);
+
+ smallest_seqno = std::min(smallest_seqno, key.sequence);
+ largest_seqno = std::max(largest_seqno, key.sequence);
+
+ if (first_key ||
+ cfd_->user_comparator()->Compare(key.user_key, smallest) < 0) {
+ smallest.assign(key.user_key.data(), key.user_key.size());
+ smallest_key.DecodeFrom(skey);
+ }
+ if (first_key ||
+ cfd_->user_comparator()->Compare(key.user_key, largest) > 0) {
+ largest.assign(key.user_key.data(), key.user_key.size());
+ largest_key.DecodeFrom(skey);
+ }
+
+ first_key = false;
+
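+ // For blob index entries, track the smallest referenced blob file number,
+ // skipping inlined and TTL blob references.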
+ if (pik_status.ok() && key.type == kTypeBlobIndex) {
+ BlobIndex blob_index;
+ const Status s = blob_index.DecodeFrom(value);
+ if (!s.ok()) {
+ continue;
+ }
+
+ if (blob_index.IsInlined() || blob_index.HasTTL() ||
+ blob_index.file_number() == kInvalidBlobFileNumber) {
+ continue;
+ }
+
+ if (oldest_blob_file_number == kInvalidBlobFileNumber ||
+ oldest_blob_file_number > blob_index.file_number()) {
+ oldest_blob_file_number = blob_index.file_number();
+ }
+ }
+ }
+
+ uint64_t file_number = versions_->NewFileNumber();
+
+ uint64_t file_size = 0;
+ if (table_type_ == TableTypeForTest::kBlockBasedTable) {
+ CreateTable(GenerateFileName(file_number), contents, file_size);
+ } else if (table_type_ == TableTypeForTest::kMockTable) {
+ file_size = 10;
+ EXPECT_OK(mock_table_factory_->CreateMockTable(
+ env_, GenerateFileName(file_number), std::move(contents)));
+ } else {
+ assert(false);
+ }
+
+ VersionEdit edit;
+ edit.AddFile(level, file_number, 0, file_size, smallest_key, largest_key,
+ smallest_seqno, largest_seqno, false, Temperature::kUnknown,
+ oldest_blob_file_number, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+
+ mutex_.Lock();
+ EXPECT_OK(
+ versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(),
+ mutable_cf_options_, &edit, &mutex_, nullptr));
+ mutex_.Unlock();
+ }
+
+ void VerifyTables(int output_level,
+ const std::vector<mock::KVVector>& expected_results,
+ std::vector<uint64_t> expected_oldest_blob_file_numbers) {
+ if (expected_results.empty()) {
+ ASSERT_EQ(compaction_job_stats_.num_output_files, 0U);
+ return;
+ }
+ int expected_output_file_num = 0;
+ for (const auto& e : expected_results) {
+ if (!e.empty()) {
+ ++expected_output_file_num;
+ }
+ }
+ ASSERT_EQ(expected_output_file_num, compaction_job_stats_.num_output_files);
+ if (expected_output_file_num == 0) {
+ return;
+ }
+
+ if (expected_oldest_blob_file_numbers.empty()) {
+ expected_oldest_blob_file_numbers.resize(expected_output_file_num,
+ kInvalidBlobFileNumber);
+ }
+
+ auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+ if (table_type_ == TableTypeForTest::kMockTable) {
+ ASSERT_EQ(compaction_job_stats_.num_output_files,
+ expected_results.size());
+ mock_table_factory_->AssertLatestFiles(expected_results);
+ } else {
+ assert(table_type_ == TableTypeForTest::kBlockBasedTable);
+ }
+
+ auto output_files =
+ cfd->current()->storage_info()->LevelFiles(output_level);
+ ASSERT_EQ(expected_output_file_num, output_files.size());
+
+ if (table_type_ == TableTypeForTest::kMockTable) {
+ assert(output_files.size() ==
+ static_cast<size_t>(expected_output_file_num));
+ const FileMetaData* const output_file = output_files[0];
+ ASSERT_EQ(output_file->oldest_blob_file_number,
+ expected_oldest_blob_file_numbers[0]);
+ return;
+ }
+
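+ // For block-based tables, read each output file back with a TableReader and
+ // compare its contents against the expected results.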
+ for (size_t i = 0; i < expected_results.size(); ++i) {
+ const FileMetaData* const output_file = output_files[i];
+ std::string file_name = GenerateFileName(output_file->fd.GetNumber());
+ const auto& fs = env_->GetFileSystem();
+ std::unique_ptr<RandomAccessFileReader> freader;
+ IOStatus ios = RandomAccessFileReader::Create(
+ fs, file_name, FileOptions(), &freader, nullptr);
+ ASSERT_OK(ios);
+ std::unique_ptr<TableReader> table_reader;
+ uint64_t file_size = output_file->fd.GetFileSize();
+ ReadOptions read_opts;
+ Status s = cf_options_.table_factory->NewTableReader(
+ read_opts,
+ TableReaderOptions(*cfd->ioptions(), nullptr, FileOptions(),
+ cfd_->internal_comparator()),
+ std::move(freader), file_size, &table_reader, false);
+ ASSERT_OK(s);
+ assert(table_reader);
+ std::unique_ptr<InternalIterator> iiter(
+ table_reader->NewIterator(read_opts, nullptr, nullptr, true,
+ TableReaderCaller::kUncategorized));
+ assert(iiter);
+
+ mock::KVVector from_db;
+ for (iiter->SeekToFirst(); iiter->Valid(); iiter->Next()) {
+ const Slice key = iiter->key();
+ const Slice value = iiter->value();
+ from_db.emplace_back(
+ make_pair(key.ToString(false), value.ToString(false)));
+ }
+ ASSERT_EQ(expected_results[i], from_db);
+ }
+ }
+
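+ // Advances the last allocated, published, and visible sequence numbers past
+ // the given sequence number.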
+ void SetLastSequence(const SequenceNumber sequence_number) {
+ versions_->SetLastAllocatedSequence(sequence_number + 1);
+ versions_->SetLastPublishedSequence(sequence_number + 1);
+ versions_->SetLastSequence(sequence_number + 1);
+ }
+
+ // Returns the expected result after compaction.
+ mock::KVVector CreateTwoFiles(bool gen_corrupted_keys) {
+ stl_wrappers::KVMap expected_results;
+ constexpr int kKeysPerFile = 10000;
+ constexpr int kCorruptKeysPerFile = 200;
+ constexpr int kMatchingKeys = kKeysPerFile / 2;
+ SequenceNumber sequence_number = 0;
+
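+ // With gen_corrupted_keys set, keys with ids in (0, kCorruptKeysPerFile]
+ // get their key type corrupted.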
+ auto corrupt_id = [&](int id) {
+ return gen_corrupted_keys && id > 0 && id <= kCorruptKeysPerFile;
+ };
+
+ for (int i = 0; i < 2; ++i) {
+ auto contents = mock::MakeMockFile();
+ for (int k = 0; k < kKeysPerFile; ++k) {
+ auto key = std::to_string(i * kMatchingKeys + k);
+ auto value = std::to_string(i * kKeysPerFile + k);
+ InternalKey internal_key(key, ++sequence_number, kTypeValue);
+
+ // This is how the key will look once it's written to the bottommost
+ // file.
+ InternalKey bottommost_internal_key(key, 0, kTypeValue);
+
+ if (corrupt_id(k)) {
+ test::CorruptKeyType(&internal_key);
+ test::CorruptKeyType(&bottommost_internal_key);
+ }
+ contents.push_back({internal_key.Encode().ToString(), value});
+ if (i == 1 || k < kMatchingKeys || corrupt_id(k - kMatchingKeys)) {
+ expected_results.insert(
+ {bottommost_internal_key.Encode().ToString(), value});
+ }
+ }
+ mock::SortKVVector(&contents, ucmp_);
+
+ AddMockFile(contents);
+ }
+
+ SetLastSequence(sequence_number);
+
+ mock::KVVector expected_results_kvvector;
+ for (auto& kv : expected_results) {
+ expected_results_kvvector.push_back({kv.first, kv.second});
+ }
+
+ return expected_results_kvvector;
+ }
+
+ void NewDB() {
+ EXPECT_OK(DestroyDB(dbname_, Options()));
+ EXPECT_OK(env_->CreateDirIfMissing(dbname_));
+
+ std::shared_ptr<Logger> info_log;
+ DBOptions db_opts = BuildDBOptions(db_options_, mutable_db_options_);
+ Status s = CreateLoggerFromOptions(dbname_, db_opts, &info_log);
+ ASSERT_OK(s);
+ db_options_.info_log = info_log;
+
+ versions_.reset(
+ new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(),
+ &write_buffer_manager_, &write_controller_,
+ /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+ /*db_id*/ "", /*db_session_id*/ ""));
+ compaction_job_stats_.Reset();
+ ASSERT_OK(SetIdentityFile(env_, dbname_));
+
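+ // Write the initial VersionEdit to a fresh MANIFEST file.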
+ VersionEdit new_db;
+ new_db.SetLogNumber(0);
+ new_db.SetNextFile(2);
+ new_db.SetLastSequence(0);
+
+ const std::string manifest = DescriptorFileName(dbname_, 1);
+ std::unique_ptr<WritableFileWriter> file_writer;
+ const auto& fs = env_->GetFileSystem();
+ s = WritableFileWriter::Create(fs, manifest,
+ fs->OptimizeForManifestWrite(env_options_),
+ &file_writer, nullptr);
+
+ ASSERT_OK(s);
+ {
+ log::Writer log(std::move(file_writer), 0, false);
+ std::string record;
+ new_db.EncodeTo(&record);
+ s = log.AddRecord(record);
+ }
+ ASSERT_OK(s);
+ // Make "CURRENT" file that points to the new manifest file.
+ s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr);
+
+ ASSERT_OK(s);
+
+ cf_options_.merge_operator = merge_op_;
+ cf_options_.compaction_filter = compaction_filter_.get();
+ std::vector<ColumnFamilyDescriptor> column_families;
+ column_families.emplace_back(kDefaultColumnFamilyName, cf_options_);
+
+ ASSERT_OK(versions_->Recover(column_families, false));
+ cfd_ = versions_->GetColumnFamilySet()->GetDefault();
+ }
+
+ // input_files[i] on input_levels[i]
+ void RunLastLevelCompaction(
+ const std::vector<std::vector<FileMetaData*>>& input_files,
+ const std::vector<int> input_levels,
+ std::function<void(Compaction& comp)>&& verify_func,
+ const std::vector<SequenceNumber>& snapshots = {}) {
+ const int kLastLevel = cf_options_.num_levels - 1;
+ verify_per_key_placement_ = std::move(verify_func);
+ mock::KVVector empty_map;
+ RunCompaction(input_files, input_levels, {empty_map}, snapshots,
+ kMaxSequenceNumber, kLastLevel, false);
+ }
+
+ // input_files[i] on input_levels[i]
+ void RunCompaction(
+ const std::vector<std::vector<FileMetaData*>>& input_files,
+ const std::vector<int>& input_levels,
+ const std::vector<mock::KVVector>& expected_results,
+ const std::vector<SequenceNumber>& snapshots = {},
+ SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber,
+ int output_level = 1, bool verify = true,
+ std::vector<uint64_t> expected_oldest_blob_file_numbers = {},
+ bool check_get_priority = false,
+ Env::IOPriority read_io_priority = Env::IO_TOTAL,
+ Env::IOPriority write_io_priority = Env::IO_TOTAL,
+ int max_subcompactions = 0) {
+ // For the compaction, wrap fs_ in a MockTestFileSystem so that the
+ // io_priority can be verified.
+ if (test_io_priority_) {
+ db_options_.fs.reset(
+ new MockTestFileSystem(fs_, read_io_priority, write_io_priority));
+ }
+
+ auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+
+ size_t num_input_files = 0;
+ std::vector<CompactionInputFiles> compaction_input_files;
+ for (size_t i = 0; i < input_files.size(); ++i) {
+ auto level_files = input_files[i];
+ CompactionInputFiles compaction_level;
+ compaction_level.level = input_levels[i];
+ compaction_level.files.insert(compaction_level.files.end(),
+ level_files.begin(), level_files.end());
+ compaction_input_files.push_back(compaction_level);
+ num_input_files += level_files.size();
+ }
+
+ std::vector<FileMetaData*> grandparents;
+ // Strictly speaking, grandparents should come from the next non-empty
+ // level rather than simply output_level + 1.
+ const int kGrandparentsLevel = output_level + 1;
+ if (kGrandparentsLevel < cf_options_.num_levels) {
+ grandparents =
+ cfd_->current()->storage_info()->LevelFiles(kGrandparentsLevel);
+ }
+
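+ // Manually construct the Compaction over the selected input files,
+ // targeting output_level and using the current mutable options.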
+ Compaction compaction(
+ cfd->current()->storage_info(), *cfd->ioptions(),
+ *cfd->GetLatestMutableCFOptions(), mutable_db_options_,
+ compaction_input_files, output_level,
+ mutable_cf_options_.target_file_size_base,
+ mutable_cf_options_.max_compaction_bytes, 0, kNoCompression,
+ cfd->GetLatestMutableCFOptions()->compression_opts,
+ Temperature::kUnknown, max_subcompactions, grandparents, true);
+ compaction.SetInputVersion(cfd->current());
+
+ assert(db_options_.info_log);
+ LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get());
+ mutex_.Lock();
+ EventLogger event_logger(db_options_.info_log.get());
+ // TODO(yiwu) add a mock snapshot checker and add test for it.
+ SnapshotChecker* snapshot_checker = nullptr;
+ ASSERT_TRUE(full_history_ts_low_.empty() ||
+ ucmp_->timestamp_size() == full_history_ts_low_.size());
+ const std::atomic<bool> kManualCompactionCanceledFalse{false};
+ CompactionJob compaction_job(
+ 0, &compaction, db_options_, mutable_db_options_, env_options_,
+ versions_.get(), &shutting_down_, &log_buffer, nullptr, nullptr,
+ nullptr, nullptr, &mutex_, &error_handler_, snapshots,
+ earliest_write_conflict_snapshot, snapshot_checker, nullptr,
+ table_cache_, &event_logger, false, false, dbname_,
+ &compaction_job_stats_, Env::Priority::USER, nullptr /* IOTracer */,
+ /*manual_compaction_canceled=*/kManualCompactionCanceledFalse,
+ env_->GenerateUniqueId(), DBImpl::GenerateDbSessionId(nullptr),
+ full_history_ts_low_);
+ VerifyInitializationOfCompactionJobStats(compaction_job_stats_);
+
+ compaction_job.Prepare();
+ mutex_.Unlock();
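+ // Run the compaction outside the DB mutex; the results are installed under
+ // the mutex below.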
+ Status s = compaction_job.Run();
+ ASSERT_OK(s);
+ ASSERT_OK(compaction_job.io_status());
+ mutex_.Lock();
+ ASSERT_OK(compaction_job.Install(*cfd->GetLatestMutableCFOptions()));
+ ASSERT_OK(compaction_job.io_status());
+ mutex_.Unlock();
+ log_buffer.FlushBufferToLog();
+
+ if (verify) {
+ ASSERT_GE(compaction_job_stats_.elapsed_micros, 0U);
+ ASSERT_EQ(compaction_job_stats_.num_input_files, num_input_files);
+
+ VerifyTables(output_level, expected_results,
+ expected_oldest_blob_file_numbers);
+ }
+
+ if (check_get_priority) {
+ CheckGetRateLimiterPriority(compaction_job);
+ }
+
+ if (verify_per_key_placement_) {
+ // Verify per_key_placement compaction
+ assert(compaction.SupportsPerKeyPlacement());
+ verify_per_key_placement_(compaction);
+ }
+ }
+
+ void CheckGetRateLimiterPriority(CompactionJob& compaction_job) {
+ // When the state from WriteController is normal.
+ ASSERT_EQ(compaction_job.GetRateLimiterPriority(), Env::IO_LOW);
+
+ WriteController* write_controller =
+ compaction_job.versions_->GetColumnFamilySet()->write_controller();
+
+ {
+ // When the state from WriteController is Delayed.
+ std::unique_ptr<WriteControllerToken> delay_token =
+ write_controller->GetDelayToken(1000000);
+ ASSERT_EQ(compaction_job.GetRateLimiterPriority(), Env::IO_USER);
+ }
+
+ {
+ // When the state from WriteController is Stopped.
+ std::unique_ptr<WriteControllerToken> stop_token =
+ write_controller->GetStopToken();
+ ASSERT_EQ(compaction_job.GetRateLimiterPriority(), Env::IO_USER);
+ }
+ }
+
+ std::shared_ptr<Env> env_guard_;
+ Env* env_;
+ std::shared_ptr<FileSystem> fs_;
+ std::string dbname_;
+ const Comparator* const ucmp_;
+ EnvOptions env_options_;
+ ImmutableDBOptions db_options_;
+ ColumnFamilyOptions cf_options_;
+ MutableCFOptions mutable_cf_options_;
+ MutableDBOptions mutable_db_options_;
+ std::shared_ptr<Cache> table_cache_;
+ WriteController write_controller_;
+ WriteBufferManager write_buffer_manager_;
+ std::unique_ptr<VersionSet> versions_;
+ InstrumentedMutex mutex_;
+ std::atomic<bool> shutting_down_;
+ std::shared_ptr<mock::MockTableFactory> mock_table_factory_;
+ CompactionJobStats compaction_job_stats_;
+ ColumnFamilyData* cfd_;
+ std::unique_ptr<CompactionFilter> compaction_filter_;
+ std::shared_ptr<MergeOperator> merge_op_;
+ ErrorHandler error_handler_;
+ std::string full_history_ts_low_;
+ const std::function<std::string(uint64_t)> encode_u64_ts_;
+ const bool test_io_priority_;
+ std::function<void(Compaction& comp)> verify_per_key_placement_;
+ const TableTypeForTest table_type_ = kMockTable;
+};
+
+// TODO(icanadi) Make it simpler once we mock out VersionSet
+class CompactionJobTest : public CompactionJobTestBase {
+ public:
+ CompactionJobTest()
+ : CompactionJobTestBase(
+ test::PerThreadDBPath("compaction_job_test"), BytewiseComparator(),
+ [](uint64_t /*ts*/) { return ""; }, /*test_io_priority=*/false,
+ TableTypeForTest::kMockTable) {}
+};
+
+TEST_F(CompactionJobTest, Simple) {
+ NewDB();
+
+ auto expected_results = CreateTwoFiles(false);
+ auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+ constexpr int input_level = 0;
+ auto files = cfd->current()->storage_info()->LevelFiles(input_level);
+ ASSERT_EQ(2U, files.size());
+ RunCompaction({files}, {input_level}, {expected_results});
+}
+
+TEST_F(CompactionJobTest, DISABLED_SimpleCorrupted) {
+ NewDB();
+
+ auto expected_results = CreateTwoFiles(true);
+ auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+ constexpr int input_level = 0;
+ auto files = cfd->current()->storage_info()->LevelFiles(input_level);
+ RunCompaction({files}, {input_level}, {expected_results});
+ ASSERT_EQ(compaction_job_stats_.num_corrupt_keys, 400U);
+}
+
+TEST_F(CompactionJobTest, SimpleDeletion) {
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({{KeyStr("c", 4U, kTypeDeletion), ""},
+ {KeyStr("c", 3U, kTypeValue), "val"}});
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({{KeyStr("b", 2U, kTypeValue), "val"},
+ {KeyStr("b", 1U, kTypeValue), "val"}});
+ AddMockFile(file2);
+
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("b", 0U, kTypeValue), "val"}});
+
+ SetLastSequence(4U);
+ constexpr int input_level = 0;
+ auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+ RunCompaction({files}, {input_level}, {expected_results});
+}
+
+TEST_F(CompactionJobTest, OutputNothing) {
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({{KeyStr("a", 1U, kTypeValue), "val"}});
+
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({{KeyStr("a", 2U, kTypeDeletion), ""}});
+
+ AddMockFile(file2);
+
+ auto expected_results = mock::MakeMockFile();
+
+ SetLastSequence(4U);
+
+ constexpr int input_level = 0;
+ auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+ RunCompaction({files}, {input_level}, {expected_results});
+}
+
+TEST_F(CompactionJobTest, SimpleOverwrite) {
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("a", 3U, kTypeValue), "val2"},
+ {KeyStr("b", 4U, kTypeValue), "val3"},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({{KeyStr("a", 1U, kTypeValue), "val"},
+ {KeyStr("b", 2U, kTypeValue), "val"}});
+ AddMockFile(file2);
+
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), "val2"},
+ {KeyStr("b", 0U, kTypeValue), "val3"}});
+
+ SetLastSequence(4U);
+ constexpr int input_level = 0;
+ auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+ RunCompaction({files}, {input_level}, {expected_results});
+}
+
+TEST_F(CompactionJobTest, SimpleNonLastLevel) {
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("a", 5U, kTypeValue), "val2"},
+ {KeyStr("b", 6U, kTypeValue), "val3"},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({{KeyStr("a", 3U, kTypeValue), "val"},
+ {KeyStr("b", 4U, kTypeValue), "val"}});
+ AddMockFile(file2, 1);
+
+ auto file3 = mock::MakeMockFile({{KeyStr("a", 1U, kTypeValue), "val"},
+ {KeyStr("b", 2U, kTypeValue), "val"}});
+ AddMockFile(file3, 2);
+
+ // Because level 1 is not the last level, the sequence numbers of a and b
+ // cannot be set to 0
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("a", 5U, kTypeValue), "val2"},
+ {KeyStr("b", 6U, kTypeValue), "val3"}});
+
+ SetLastSequence(6U);
+ const std::vector<int> input_levels = {0, 1};
+ auto lvl0_files =
+ cfd_->current()->storage_info()->LevelFiles(input_levels[0]);
+ auto lvl1_files =
+ cfd_->current()->storage_info()->LevelFiles(input_levels[1]);
+ RunCompaction({lvl0_files, lvl1_files}, input_levels, {expected_results});
+}
+
+TEST_F(CompactionJobTest, SimpleMerge) {
+ merge_op_ = MergeOperators::CreateStringAppendOperator();
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("a", 5U, kTypeMerge), "5"},
+ {KeyStr("a", 4U, kTypeMerge), "4"},
+ {KeyStr("a", 3U, kTypeValue), "3"},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile(
+ {{KeyStr("b", 2U, kTypeMerge), "2"}, {KeyStr("b", 1U, kTypeValue), "1"}});
+ AddMockFile(file2);
+
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), "3,4,5"},
+ {KeyStr("b", 0U, kTypeValue), "1,2"}});
+
+ SetLastSequence(5U);
+ constexpr int input_level = 0;
+ auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+ RunCompaction({files}, {input_level}, {expected_results});
+}
+
+TEST_F(CompactionJobTest, NonAssocMerge) {
+ merge_op_ = MergeOperators::CreateStringAppendTESTOperator();
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("a", 5U, kTypeMerge), "5"},
+ {KeyStr("a", 4U, kTypeMerge), "4"},
+ {KeyStr("a", 3U, kTypeMerge), "3"},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile(
+ {{KeyStr("b", 2U, kTypeMerge), "2"}, {KeyStr("b", 1U, kTypeMerge), "1"}});
+ AddMockFile(file2);
+
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), "3,4,5"},
+ {KeyStr("b", 0U, kTypeValue), "1,2"}});
+
+ SetLastSequence(5U);
+ constexpr int input_level = 0;
+ auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+ RunCompaction({files}, {input_level}, {expected_results});
+}
+
+// Filters merge operands with value 10.
+TEST_F(CompactionJobTest, MergeOperandFilter) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+ compaction_filter_.reset(new test::FilterNumber(10U));
+ NewDB();
+
+ auto file1 = mock::MakeMockFile(
+ {{KeyStr("a", 5U, kTypeMerge), test::EncodeInt(5U)},
+ {KeyStr("a", 4U, kTypeMerge), test::EncodeInt(10U)}, // Filtered
+ {KeyStr("a", 3U, kTypeMerge), test::EncodeInt(3U)}});
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({
+ {KeyStr("b", 2U, kTypeMerge), test::EncodeInt(2U)},
+ {KeyStr("b", 1U, kTypeMerge), test::EncodeInt(10U)} // Filtered
+ });
+ AddMockFile(file2);
+
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), test::EncodeInt(8U)},
+ {KeyStr("b", 0U, kTypeValue), test::EncodeInt(2U)}});
+
+ SetLastSequence(5U);
+ constexpr int input_level = 0;
+ auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+ RunCompaction({files}, {input_level}, {expected_results});
+}
+
+TEST_F(CompactionJobTest, FilterSomeMergeOperands) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+ compaction_filter_.reset(new test::FilterNumber(10U));
+ NewDB();
+
+ auto file1 = mock::MakeMockFile(
+ {{KeyStr("a", 5U, kTypeMerge), test::EncodeInt(5U)},
+ {KeyStr("a", 4U, kTypeMerge), test::EncodeInt(10U)}, // Filtered
+ {KeyStr("a", 3U, kTypeValue), test::EncodeInt(5U)},
+ {KeyStr("d", 8U, kTypeMerge), test::EncodeInt(10U)}});
+ AddMockFile(file1);
+
+ auto file2 =
+ mock::MakeMockFile({{KeyStr("b", 2U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("b", 1U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("c", 2U, kTypeMerge), test::EncodeInt(3U)},
+ {KeyStr("c", 1U, kTypeValue), test::EncodeInt(7U)},
+ {KeyStr("d", 1U, kTypeValue), test::EncodeInt(6U)}});
+ AddMockFile(file2);
+
+ auto file3 =
+ mock::MakeMockFile({{KeyStr("a", 1U, kTypeMerge), test::EncodeInt(3U)}});
+ AddMockFile(file3, 2);
+
+ auto expected_results = mock::MakeMockFile({
+ {KeyStr("a", 5U, kTypeValue), test::EncodeInt(10U)},
+ {KeyStr("c", 2U, kTypeValue), test::EncodeInt(10U)},
+ {KeyStr("d", 1U, kTypeValue), test::EncodeInt(6U)}
+ // b does not appear because the operands are filtered
+ });
+
+ SetLastSequence(5U);
+ constexpr int input_level = 0;
+ auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+ RunCompaction({files}, {input_level}, {expected_results});
+}
+
+// Test where all operands/merge results are filtered out.
+TEST_F(CompactionJobTest, FilterAllMergeOperands) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+ compaction_filter_.reset(new test::FilterNumber(10U));
+ NewDB();
+
+ auto file1 =
+ mock::MakeMockFile({{KeyStr("a", 11U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("a", 10U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("a", 9U, kTypeMerge), test::EncodeInt(10U)}});
+ AddMockFile(file1);
+
+ auto file2 =
+ mock::MakeMockFile({{KeyStr("b", 8U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("b", 7U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("b", 6U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("b", 5U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("b", 4U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("b", 3U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("b", 2U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("c", 2U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("c", 1U, kTypeMerge), test::EncodeInt(10U)}});
+ AddMockFile(file2);
+
+ auto file3 =
+ mock::MakeMockFile({{KeyStr("a", 2U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("b", 1U, kTypeMerge), test::EncodeInt(10U)}});
+ AddMockFile(file3, 2);
+
+ SetLastSequence(11U);
+ constexpr int input_level = 0;
+ auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+
+ mock::KVVector empty_map;
+ RunCompaction({files}, {input_level}, {empty_map});
+}
+
+TEST_F(CompactionJobTest, SimpleSingleDelete) {
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("a", 5U, kTypeDeletion), ""},
+ {KeyStr("b", 6U, kTypeSingleDeletion), ""},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({{KeyStr("a", 3U, kTypeValue), "val"},
+ {KeyStr("b", 4U, kTypeValue), "val"}});
+ AddMockFile(file2);
+
+ auto file3 = mock::MakeMockFile({
+ {KeyStr("a", 1U, kTypeValue), "val"},
+ });
+ AddMockFile(file3, 2);
+
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("a", 5U, kTypeDeletion), ""}});
+
+ SetLastSequence(6U);
+ constexpr int input_level = 0;
+ auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+ RunCompaction({files}, {input_level}, {expected_results});
+}
+
+TEST_F(CompactionJobTest, SingleDeleteSnapshots) {
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("A", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("a", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("b", 21U, kTypeSingleDeletion), ""},
+ {KeyStr("c", 22U, kTypeSingleDeletion), ""},
+ {KeyStr("d", 9U, kTypeSingleDeletion), ""},
+ {KeyStr("f", 21U, kTypeSingleDeletion), ""},
+ {KeyStr("j", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("j", 9U, kTypeSingleDeletion), ""},
+ {KeyStr("k", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("k", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("l", 3U, kTypeSingleDeletion), ""},
+ {KeyStr("l", 2U, kTypeSingleDeletion), ""},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({
+ {KeyStr("0", 2U, kTypeSingleDeletion), ""},
+ {KeyStr("a", 11U, kTypeValue), "val1"},
+ {KeyStr("b", 11U, kTypeValue), "val2"},
+ {KeyStr("c", 21U, kTypeValue), "val3"},
+ {KeyStr("d", 8U, kTypeValue), "val4"},
+ {KeyStr("e", 2U, kTypeSingleDeletion), ""},
+ {KeyStr("f", 1U, kTypeValue), "val1"},
+ {KeyStr("g", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("h", 2U, kTypeSingleDeletion), ""},
+ {KeyStr("m", 12U, kTypeValue), "val1"},
+ {KeyStr("m", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("m", 8U, kTypeValue), "val2"},
+ });
+ AddMockFile(file2);
+
+ auto file3 = mock::MakeMockFile({
+ {KeyStr("A", 1U, kTypeValue), "val"},
+ {KeyStr("e", 1U, kTypeValue), "val"},
+ });
+ AddMockFile(file3, 2);
+
+ auto expected_results = mock::MakeMockFile({
+ {KeyStr("A", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("a", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("a", 11U, kTypeValue), ""},
+ {KeyStr("b", 21U, kTypeSingleDeletion), ""},
+ {KeyStr("b", 11U, kTypeValue), "val2"},
+ {KeyStr("c", 22U, kTypeSingleDeletion), ""},
+ {KeyStr("c", 21U, kTypeValue), ""},
+ {KeyStr("e", 2U, kTypeSingleDeletion), ""},
+ {KeyStr("f", 21U, kTypeSingleDeletion), ""},
+ {KeyStr("f", 1U, kTypeValue), "val1"},
+ {KeyStr("g", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("j", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("k", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("m", 12U, kTypeValue), "val1"},
+ {KeyStr("m", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("m", 8U, kTypeValue), "val2"},
+ });
+
+ SetLastSequence(22U);
+ constexpr int input_level = 0;
+ auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+ RunCompaction({files}, {input_level}, {expected_results}, {10U, 20U}, 10U);
+}
+
+TEST_F(CompactionJobTest, EarliestWriteConflictSnapshot) {
+ NewDB();
+
+ // Test multiple snapshots where the earliest snapshot is not a
+ // write-conflict snapshot.
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("A", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("A", 23U, kTypeValue), "val"},
+ {KeyStr("B", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("B", 23U, kTypeValue), "val"},
+ {KeyStr("D", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("G", 32U, kTypeSingleDeletion), ""},
+ {KeyStr("G", 31U, kTypeValue), "val"},
+ {KeyStr("G", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("G", 23U, kTypeValue), "val2"},
+ {KeyStr("H", 31U, kTypeValue), "val"},
+ {KeyStr("H", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("H", 23U, kTypeValue), "val"},
+ {KeyStr("I", 35U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 34U, kTypeValue), "val2"},
+ {KeyStr("I", 33U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 32U, kTypeValue), "val3"},
+ {KeyStr("I", 31U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 34U, kTypeValue), "val"},
+ {KeyStr("J", 33U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 25U, kTypeValue), "val2"},
+ {KeyStr("J", 24U, kTypeSingleDeletion), ""},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({
+ {KeyStr("A", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("A", 13U, kTypeValue), "val2"},
+ {KeyStr("C", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("C", 13U, kTypeValue), "val"},
+ {KeyStr("E", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("F", 4U, kTypeSingleDeletion), ""},
+ {KeyStr("F", 3U, kTypeValue), "val"},
+ {KeyStr("G", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("G", 13U, kTypeValue), "val3"},
+ {KeyStr("H", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("H", 13U, kTypeValue), "val2"},
+ {KeyStr("I", 13U, kTypeValue), "val4"},
+ {KeyStr("I", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 11U, kTypeValue), "val5"},
+ {KeyStr("J", 15U, kTypeValue), "val3"},
+ {KeyStr("J", 14U, kTypeSingleDeletion), ""},
+ });
+ AddMockFile(file2);
+
+ auto expected_results = mock::MakeMockFile({
+ {KeyStr("A", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("A", 23U, kTypeValue), ""},
+ {KeyStr("B", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("B", 23U, kTypeValue), ""},
+ {KeyStr("D", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("E", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("G", 32U, kTypeSingleDeletion), ""},
+ {KeyStr("G", 31U, kTypeValue), ""},
+ {KeyStr("H", 31U, kTypeValue), "val"},
+ {KeyStr("I", 35U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 34U, kTypeValue), ""},
+ {KeyStr("I", 31U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 13U, kTypeValue), "val4"},
+ {KeyStr("J", 34U, kTypeValue), "val"},
+ {KeyStr("J", 33U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 25U, kTypeValue), "val2"},
+ {KeyStr("J", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 15U, kTypeValue), "val3"},
+ {KeyStr("J", 14U, kTypeSingleDeletion), ""},
+ });
+
+ SetLastSequence(24U);
+ constexpr int input_level = 0;
+ auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+ RunCompaction({files}, {input_level}, {expected_results}, {10U, 20U, 30U},
+ 20U);
+}
+
+TEST_F(CompactionJobTest, SingleDeleteZeroSeq) {
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("A", 10U, kTypeSingleDeletion), ""},
+ {KeyStr("dummy", 5U, kTypeValue), "val2"},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({
+ {KeyStr("A", 0U, kTypeValue), "val"},
+ });
+ AddMockFile(file2);
+
+ auto expected_results = mock::MakeMockFile({
+ {KeyStr("dummy", 0U, kTypeValue), "val2"},
+ });
+
+ SetLastSequence(22U);
+ constexpr int input_level = 0;
+ auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+ RunCompaction({files}, {input_level}, {expected_results}, {});
+}
+
+TEST_F(CompactionJobTest, MultiSingleDelete) {
+ // Tests several scenarios involving multiple single delete/put pairs:
+ //
+ // A: Put Snapshot SDel Put SDel -> Put Snapshot SDel
+ // B: Snapshot Put SDel Put SDel Snapshot -> Snapshot SDel Snapshot
+ // C: SDel Put SDel Snapshot Put -> Snapshot Put
+ // D: (Put) SDel Snapshot Put SDel -> (Put) SDel Snapshot SDel
+ // E: Put SDel Snapshot Put SDel -> Snapshot SDel
+ // F: Put SDel Put SDel Snapshot -> removed
+ // G: Snapshot SDel Put SDel Put -> Snapshot Put SDel
+ // H: (Put) Put SDel Put SDel Snapshot -> removed
+ // I: (Put) Snapshot Put SDel Put SDel -> SDel
+ // J: Put Put SDel Put SDel SDel Snapshot Put Put SDel SDel Put
+ // -> Snapshot Put
+ // K: SDel SDel Put SDel Put Put Snapshot SDel Put SDel SDel Put SDel
+ // -> Snapshot Put Snapshot SDel
+ // L: SDel Put SDel Put SDel Snapshot SDel Put SDel SDel Put SDel
+ // -> Snapshot SDel Put SDel
+ // M: (Put) SDel Put SDel Put SDel Snapshot Put SDel SDel Put SDel SDel
+ // -> SDel Snapshot Put SDel
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("A", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("A", 13U, kTypeValue), "val5"},
+ {KeyStr("A", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("B", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("B", 13U, kTypeValue), "val2"},
+ {KeyStr("C", 14U, kTypeValue), "val3"},
+ {KeyStr("D", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("D", 11U, kTypeValue), "val4"},
+ {KeyStr("G", 15U, kTypeValue), "val"},
+ {KeyStr("G", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("G", 13U, kTypeValue), "val"},
+ {KeyStr("I", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 13U, kTypeValue), "val"},
+ {KeyStr("J", 15U, kTypeValue), "val"},
+ {KeyStr("J", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 13U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 12U, kTypeValue), "val"},
+ {KeyStr("J", 11U, kTypeValue), "val"},
+ {KeyStr("K", 16U, kTypeSingleDeletion), ""},
+ {KeyStr("K", 15U, kTypeValue), "val1"},
+ {KeyStr("K", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("K", 13U, kTypeSingleDeletion), ""},
+ {KeyStr("K", 12U, kTypeValue), "val2"},
+ {KeyStr("K", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("L", 16U, kTypeSingleDeletion), ""},
+ {KeyStr("L", 15U, kTypeValue), "val"},
+ {KeyStr("L", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("L", 13U, kTypeSingleDeletion), ""},
+ {KeyStr("L", 12U, kTypeValue), "val"},
+ {KeyStr("L", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("M", 16U, kTypeSingleDeletion), ""},
+ {KeyStr("M", 15U, kTypeSingleDeletion), ""},
+ {KeyStr("M", 14U, kTypeValue), "val"},
+ {KeyStr("M", 13U, kTypeSingleDeletion), ""},
+ {KeyStr("M", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("M", 11U, kTypeValue), "val"},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({
+ {KeyStr("A", 10U, kTypeValue), "val"},
+ {KeyStr("B", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("B", 11U, kTypeValue), "val2"},
+ {KeyStr("C", 10U, kTypeSingleDeletion), ""},
+ {KeyStr("C", 9U, kTypeValue), "val6"},
+ {KeyStr("C", 8U, kTypeSingleDeletion), ""},
+ {KeyStr("D", 10U, kTypeSingleDeletion), ""},
+ {KeyStr("E", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("E", 11U, kTypeValue), "val"},
+ {KeyStr("E", 5U, kTypeSingleDeletion), ""},
+ {KeyStr("E", 4U, kTypeValue), "val"},
+ {KeyStr("F", 6U, kTypeSingleDeletion), ""},
+ {KeyStr("F", 5U, kTypeValue), "val"},
+ {KeyStr("F", 4U, kTypeSingleDeletion), ""},
+ {KeyStr("F", 3U, kTypeValue), "val"},
+ {KeyStr("G", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("H", 6U, kTypeSingleDeletion), ""},
+ {KeyStr("H", 5U, kTypeValue), "val"},
+ {KeyStr("H", 4U, kTypeSingleDeletion), ""},
+ {KeyStr("H", 3U, kTypeValue), "val"},
+ {KeyStr("I", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 11U, kTypeValue), "val"},
+ {KeyStr("J", 6U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 5U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 4U, kTypeValue), "val"},
+ {KeyStr("J", 3U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 2U, kTypeValue), "val"},
+ {KeyStr("K", 8U, kTypeValue), "val3"},
+ {KeyStr("K", 7U, kTypeValue), "val4"},
+ {KeyStr("K", 6U, kTypeSingleDeletion), ""},
+ {KeyStr("K", 5U, kTypeValue), "val5"},
+ {KeyStr("K", 2U, kTypeSingleDeletion), ""},
+ {KeyStr("K", 1U, kTypeSingleDeletion), ""},
+ {KeyStr("L", 5U, kTypeSingleDeletion), ""},
+ {KeyStr("L", 4U, kTypeValue), "val"},
+ {KeyStr("L", 3U, kTypeSingleDeletion), ""},
+ {KeyStr("L", 2U, kTypeValue), "val"},
+ {KeyStr("L", 1U, kTypeSingleDeletion), ""},
+ {KeyStr("M", 10U, kTypeSingleDeletion), ""},
+ {KeyStr("M", 7U, kTypeValue), "val"},
+ {KeyStr("M", 5U, kTypeSingleDeletion), ""},
+ {KeyStr("M", 4U, kTypeValue), "val"},
+ {KeyStr("M", 3U, kTypeSingleDeletion), ""},
+ });
+ AddMockFile(file2);
+
+ auto file3 = mock::MakeMockFile({
+ {KeyStr("D", 1U, kTypeValue), "val"},
+ {KeyStr("H", 1U, kTypeValue), "val"},
+ {KeyStr("I", 2U, kTypeValue), "val"},
+ });
+ AddMockFile(file3, 2);
+
+ auto file4 = mock::MakeMockFile({
+ {KeyStr("M", 1U, kTypeValue), "val"},
+ });
+ AddMockFile(file4, 2);
+
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("A", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("A", 13U, kTypeValue), ""},
+ {KeyStr("A", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("A", 10U, kTypeValue), "val"},
+ {KeyStr("B", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("B", 13U, kTypeValue), ""},
+ {KeyStr("C", 14U, kTypeValue), "val3"},
+ {KeyStr("D", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("D", 11U, kTypeValue), ""},
+ {KeyStr("D", 10U, kTypeSingleDeletion), ""},
+ {KeyStr("E", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("E", 11U, kTypeValue), ""},
+ {KeyStr("G", 15U, kTypeValue), "val"},
+ {KeyStr("G", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 13U, kTypeValue), ""},
+ {KeyStr("J", 15U, kTypeValue), "val"},
+ {KeyStr("K", 16U, kTypeSingleDeletion), ""},
+ {KeyStr("K", 15U, kTypeValue), ""},
+ {KeyStr("K", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("K", 8U, kTypeValue), "val3"},
+ {KeyStr("L", 16U, kTypeSingleDeletion), ""},
+ {KeyStr("L", 15U, kTypeValue), ""},
+ {KeyStr("L", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("M", 15U, kTypeSingleDeletion), ""},
+ {KeyStr("M", 14U, kTypeValue), ""},
+ {KeyStr("M", 3U, kTypeSingleDeletion), ""}});
+
+ SetLastSequence(22U);
+ constexpr int input_level = 0;
+ auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+ RunCompaction({files}, {input_level}, {expected_results}, {10U}, 10U);
+}
+
+// This test documents the behavior where a corrupt key follows a deletion or a
+// single deletion and the (single) deletion gets removed while the corrupt key
+// gets written out. TODO(noetzli): We probably want a better way to treat
+// corrupt keys.
+TEST_F(CompactionJobTest, DISABLED_CorruptionAfterDeletion) {
+ NewDB();
+
+ auto file1 =
+ mock::MakeMockFile({{test::KeyStr("A", 6U, kTypeValue), "val3"},
+ {test::KeyStr("a", 5U, kTypeDeletion), ""},
+ {test::KeyStr("a", 4U, kTypeValue, true), "val"}});
+ AddMockFile(file1);
+
+ auto file2 =
+ mock::MakeMockFile({{test::KeyStr("b", 3U, kTypeSingleDeletion), ""},
+ {test::KeyStr("b", 2U, kTypeValue, true), "val"},
+ {test::KeyStr("c", 1U, kTypeValue), "val2"}});
+ AddMockFile(file2);
+
+ auto expected_results =
+ mock::MakeMockFile({{test::KeyStr("A", 0U, kTypeValue), "val3"},
+ {test::KeyStr("a", 0U, kTypeValue, true), "val"},
+ {test::KeyStr("b", 0U, kTypeValue, true), "val"},
+ {test::KeyStr("c", 0U, kTypeValue), "val2"}});
+
+ SetLastSequence(6U);
+ constexpr int input_level = 0;
+ auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+ RunCompaction({files}, {input_level}, {expected_results});
+}
+
+TEST_F(CompactionJobTest, OldestBlobFileNumber) {
+ NewDB();
+
+ // Note: blob1 is inlined TTL, so it will not be considered for the purposes
+ // of identifying the oldest referenced blob file. Similarly, blob6 will be
+ // ignored because it has TTL and hence refers to a TTL blob file.
+ const stl_wrappers::KVMap::value_type blob1(
+ KeyStr("a", 1U, kTypeBlobIndex), BlobStrInlinedTTL("foo", 1234567890ULL));
+ const stl_wrappers::KVMap::value_type blob2(KeyStr("b", 2U, kTypeBlobIndex),
+ BlobStr(59, 123456, 999));
+ const stl_wrappers::KVMap::value_type blob3(KeyStr("c", 3U, kTypeBlobIndex),
+ BlobStr(138, 1000, 1 << 8));
+ auto file1 = mock::MakeMockFile({blob1, blob2, blob3});
+ AddMockFile(file1);
+
+ const stl_wrappers::KVMap::value_type blob4(KeyStr("d", 4U, kTypeBlobIndex),
+ BlobStr(199, 3 << 10, 1 << 20));
+ const stl_wrappers::KVMap::value_type blob5(KeyStr("e", 5U, kTypeBlobIndex),
+ BlobStr(19, 6789, 333));
+ const stl_wrappers::KVMap::value_type blob6(
+ KeyStr("f", 6U, kTypeBlobIndex),
+ BlobStrTTL(5, 2048, 1 << 7, 1234567890ULL));
+ auto file2 = mock::MakeMockFile({blob4, blob5, blob6});
+ AddMockFile(file2);
+
+ const stl_wrappers::KVMap::value_type expected_blob1(
+ KeyStr("a", 0U, kTypeBlobIndex), blob1.second);
+ const stl_wrappers::KVMap::value_type expected_blob2(
+ KeyStr("b", 0U, kTypeBlobIndex), blob2.second);
+ const stl_wrappers::KVMap::value_type expected_blob3(
+ KeyStr("c", 0U, kTypeBlobIndex), blob3.second);
+ const stl_wrappers::KVMap::value_type expected_blob4(
+ KeyStr("d", 0U, kTypeBlobIndex), blob4.second);
+ const stl_wrappers::KVMap::value_type expected_blob5(
+ KeyStr("e", 0U, kTypeBlobIndex), blob5.second);
+ const stl_wrappers::KVMap::value_type expected_blob6(
+ KeyStr("f", 0U, kTypeBlobIndex), blob6.second);
+ auto expected_results =
+ mock::MakeMockFile({expected_blob1, expected_blob2, expected_blob3,
+ expected_blob4, expected_blob5, expected_blob6});
+
+ SetLastSequence(6U);
+ constexpr int input_level = 0;
+ auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+ RunCompaction({files}, {input_level}, {expected_results},
+ std::vector<SequenceNumber>(), kMaxSequenceNumber,
+ /* output_level */ 1, /* verify */ true,
+ /* expected_oldest_blob_file_numbers */ {19});
+}
+
+TEST_F(CompactionJobTest, VerifyPenultimateLevelOutput) {
+ cf_options_.bottommost_temperature = Temperature::kCold;
+ SyncPoint::GetInstance()->SetCallBack(
+ "Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) {
+ auto supports_per_key_placement = static_cast<bool*>(arg);
+ *supports_per_key_placement = true;
+ });
+
+ std::atomic_uint64_t latest_cold_seq = 0;
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionIterator::PrepareOutput.context", [&](void* arg) {
+ auto context = static_cast<PerKeyPlacementContext*>(arg);
+ context->output_to_penultimate_level =
+ context->seq_num > latest_cold_seq;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ NewDB();
+
+ // Add files on different levels that may overlap
+ auto file0_1 = mock::MakeMockFile({{KeyStr("z", 12U, kTypeValue), "val"}});
+ AddMockFile(file0_1);
+
+ auto file1_1 = mock::MakeMockFile({{KeyStr("b", 10U, kTypeValue), "val"},
+ {KeyStr("f", 11U, kTypeValue), "val"}});
+ AddMockFile(file1_1, 1);
+ auto file1_2 = mock::MakeMockFile({{KeyStr("j", 12U, kTypeValue), "val"},
+ {KeyStr("k", 13U, kTypeValue), "val"}});
+ AddMockFile(file1_2, 1);
+ auto file1_3 = mock::MakeMockFile({{KeyStr("p", 14U, kTypeValue), "val"},
+ {KeyStr("u", 15U, kTypeValue), "val"}});
+ AddMockFile(file1_3, 1);
+
+ auto file2_1 = mock::MakeMockFile({{KeyStr("f", 8U, kTypeValue), "val"},
+ {KeyStr("h", 9U, kTypeValue), "val"}});
+ AddMockFile(file2_1, 2);
+ auto file2_2 = mock::MakeMockFile({{KeyStr("m", 6U, kTypeValue), "val"},
+ {KeyStr("p", 7U, kTypeValue), "val"}});
+ AddMockFile(file2_2, 2);
+
+ auto file3_1 = mock::MakeMockFile({{KeyStr("g", 2U, kTypeValue), "val"},
+ {KeyStr("k", 3U, kTypeValue), "val"}});
+ AddMockFile(file3_1, 3);
+ auto file3_2 = mock::MakeMockFile({{KeyStr("v", 4U, kTypeValue), "val"},
+ {KeyStr("x", 5U, kTypeValue), "val"}});
+ AddMockFile(file3_2, 3);
+
+ auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+ const std::vector<int> input_levels = {0, 1, 2, 3};
+ auto files0 = cfd->current()->storage_info()->LevelFiles(input_levels[0]);
+ auto files1 = cfd->current()->storage_info()->LevelFiles(input_levels[1]);
+ auto files2 = cfd->current()->storage_info()->LevelFiles(input_levels[2]);
+ auto files3 = cfd->current()->storage_info()->LevelFiles(input_levels[3]);
+
+ RunLastLevelCompaction(
+ {files0, files1, files2, files3}, input_levels,
+ /*verify_func=*/[&](Compaction& comp) {
+ for (char c = 'a'; c <= 'z'; c++) {
+ std::string c_str(1, c);
+ const Slice key(c_str);
+ if (c == 'a') {
+ ASSERT_FALSE(comp.WithinPenultimateLevelOutputRange(key));
+ } else {
+ ASSERT_TRUE(comp.WithinPenultimateLevelOutputRange(key));
+ }
+ }
+ });
+}
+
+TEST_F(CompactionJobTest, NoEnforceSingleDeleteContract) {
+ db_options_.enforce_single_del_contracts = false;
+ NewDB();
+
+ auto file =
+ mock::MakeMockFile({{KeyStr("a", 4U, kTypeSingleDeletion), ""},
+ {KeyStr("a", 3U, kTypeDeletion), "dontcare"}});
+ AddMockFile(file);
+ SetLastSequence(4U);
+
+ auto expected_results = mock::MakeMockFile();
+ constexpr int input_level = 0;
+ auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+ RunCompaction({files}, {input_level}, {expected_results});
+}
+
+TEST_F(CompactionJobTest, InputSerialization) {
+ // Setup a random CompactionServiceInput
+ CompactionServiceInput input;
+ const int kStrMaxLen = 1000;
+ Random rnd(static_cast<uint32_t>(time(nullptr)));
+ Random64 rnd64(time(nullptr));
+ input.column_family.name = rnd.RandomString(rnd.Uniform(kStrMaxLen));
+ input.column_family.options.comparator = ReverseBytewiseComparator();
+ input.column_family.options.max_bytes_for_level_base =
+ rnd64.Uniform(UINT64_MAX);
+ input.column_family.options.disable_auto_compactions = rnd.OneIn(2);
+ input.column_family.options.compression = kZSTD;
+ input.column_family.options.compression_opts.level = 4;
+ input.db_options.max_background_flushes = 10;
+ input.db_options.paranoid_checks = rnd.OneIn(2);
+ input.db_options.statistics = CreateDBStatistics();
+ input.db_options.env = env_;
+ while (!rnd.OneIn(10)) {
+ input.snapshots.emplace_back(rnd64.Uniform(UINT64_MAX));
+ }
+ while (!rnd.OneIn(10)) {
+ input.input_files.emplace_back(rnd.RandomString(
+ rnd.Uniform(kStrMaxLen - 1) +
+ 1)); // input file name should have at least one character
+ }
+ input.output_level = 4;
+ input.has_begin = rnd.OneIn(2);
+ if (input.has_begin) {
+ input.begin = rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen));
+ }
+ input.has_end = rnd.OneIn(2);
+ if (input.has_end) {
+ input.end = rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen));
+ }
+
+ std::string output;
+ ASSERT_OK(input.Write(&output));
+
+ // Test deserialization
+ CompactionServiceInput deserialized1;
+ ASSERT_OK(CompactionServiceInput::Read(output, &deserialized1));
+ ASSERT_TRUE(deserialized1.TEST_Equals(&input));
+
+ // Test mismatch
+ deserialized1.db_options.max_background_flushes += 10;
+ std::string mismatch;
+ ASSERT_FALSE(deserialized1.TEST_Equals(&input, &mismatch));
+ ASSERT_EQ(mismatch, "db_options.max_background_flushes");
+
+ // Test unknown field
+ CompactionServiceInput deserialized2;
+ output.clear();
+ ASSERT_OK(input.Write(&output));
+ output.append("new_field=123;");
+
+ ASSERT_OK(CompactionServiceInput::Read(output, &deserialized2));
+ ASSERT_TRUE(deserialized2.TEST_Equals(&input));
+
+ // Test missing field
+ CompactionServiceInput deserialized3;
+ deserialized3.output_level = 0;
+ std::string to_remove = "output_level=4;";
+ size_t pos = output.find(to_remove);
+ ASSERT_TRUE(pos != std::string::npos);
+ output.erase(pos, to_remove.length());
+ ASSERT_OK(CompactionServiceInput::Read(output, &deserialized3));
+ mismatch.clear();
+ ASSERT_FALSE(deserialized3.TEST_Equals(&input, &mismatch));
+ ASSERT_EQ(mismatch, "output_level");
+
+ // manually set the value back, should match the original structure
+ deserialized3.output_level = 4;
+ ASSERT_TRUE(deserialized3.TEST_Equals(&input));
+
+ // Test invalid version
+ output.clear();
+ ASSERT_OK(input.Write(&output));
+
+ uint32_t data_version = DecodeFixed32(output.data());
+ const size_t kDataVersionSize = sizeof(data_version);
+ ASSERT_EQ(data_version,
+ 1U); // Update once the default data version is changed
+ char buf[kDataVersionSize];
+ EncodeFixed32(buf, data_version + 10); // make sure it's not valid
+ output.replace(0, kDataVersionSize, buf, kDataVersionSize);
+ Status s = CompactionServiceInput::Read(output, &deserialized3);
+ ASSERT_TRUE(s.IsNotSupported());
+}
+
+TEST_F(CompactionJobTest, ResultSerialization) {
+ // Setup a random CompactionServiceResult
+ CompactionServiceResult result;
+ const int kStrMaxLen = 1000;
+ Random rnd(static_cast<uint32_t>(time(nullptr)));
+ Random64 rnd64(time(nullptr));
+ std::vector<Status> status_list = {
+ Status::OK(),
+ Status::InvalidArgument("invalid option"),
+ Status::Aborted("failed to run"),
+ Status::NotSupported("not supported option"),
+ };
+ result.status =
+ status_list.at(rnd.Uniform(static_cast<int>(status_list.size())));
+ while (!rnd.OneIn(10)) {
+ UniqueId64x2 id{rnd64.Uniform(UINT64_MAX), rnd64.Uniform(UINT64_MAX)};
+ result.output_files.emplace_back(
+ rnd.RandomString(rnd.Uniform(kStrMaxLen)), rnd64.Uniform(UINT64_MAX),
+ rnd64.Uniform(UINT64_MAX),
+ rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen)),
+ rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen)),
+ rnd64.Uniform(UINT64_MAX), rnd64.Uniform(UINT64_MAX),
+ rnd64.Uniform(UINT64_MAX), rnd.OneIn(2), id);
+ }
+ result.output_level = rnd.Uniform(10);
+ result.output_path = rnd.RandomString(rnd.Uniform(kStrMaxLen));
+ result.num_output_records = rnd64.Uniform(UINT64_MAX);
+ result.total_bytes = rnd64.Uniform(UINT64_MAX);
+ result.bytes_read = 123;
+ result.bytes_written = rnd64.Uniform(UINT64_MAX);
+ result.stats.elapsed_micros = rnd64.Uniform(UINT64_MAX);
+ result.stats.num_output_files = rnd.Uniform(1000);
+ result.stats.is_full_compaction = rnd.OneIn(2);
+ result.stats.num_single_del_mismatch = rnd64.Uniform(UINT64_MAX);
+ result.stats.num_input_files = 9;
+
+ std::string output;
+ ASSERT_OK(result.Write(&output));
+
+ // Test deserialization
+ CompactionServiceResult deserialized1;
+ ASSERT_OK(CompactionServiceResult::Read(output, &deserialized1));
+ ASSERT_TRUE(deserialized1.TEST_Equals(&result));
+
+ // Test mismatch
+ deserialized1.stats.num_input_files += 10;
+ std::string mismatch;
+ ASSERT_FALSE(deserialized1.TEST_Equals(&result, &mismatch));
+ ASSERT_EQ(mismatch, "stats.num_input_files");
+
+ // Test unique id mismatch
+ if (!result.output_files.empty()) {
+ CompactionServiceResult deserialized_tmp;
+ ASSERT_OK(CompactionServiceResult::Read(output, &deserialized_tmp));
+ deserialized_tmp.output_files[0].unique_id[0] += 1;
+ ASSERT_FALSE(deserialized_tmp.TEST_Equals(&result, &mismatch));
+ ASSERT_EQ(mismatch, "output_files.unique_id");
+ deserialized_tmp.status.PermitUncheckedError();
+ }
+
+ // Test unknown field
+ CompactionServiceResult deserialized2;
+ output.clear();
+ ASSERT_OK(result.Write(&output));
+ output.append("new_field=123;");
+
+ ASSERT_OK(CompactionServiceResult::Read(output, &deserialized2));
+ ASSERT_TRUE(deserialized2.TEST_Equals(&result));
+
+ // Test missing field
+ CompactionServiceResult deserialized3;
+ deserialized3.bytes_read = 0;
+ std::string to_remove = "bytes_read=123;";
+ size_t pos = output.find(to_remove);
+ ASSERT_TRUE(pos != std::string::npos);
+ output.erase(pos, to_remove.length());
+ ASSERT_OK(CompactionServiceResult::Read(output, &deserialized3));
+ mismatch.clear();
+ ASSERT_FALSE(deserialized3.TEST_Equals(&result, &mismatch));
+ ASSERT_EQ(mismatch, "bytes_read");
+
+ deserialized3.bytes_read = 123;
+ ASSERT_TRUE(deserialized3.TEST_Equals(&result));
+
+ // Test invalid version
+ output.clear();
+ ASSERT_OK(result.Write(&output));
+
+ uint32_t data_version = DecodeFixed32(output.data());
+ const size_t kDataVersionSize = sizeof(data_version);
+ ASSERT_EQ(data_version,
+ 1U); // Update once the default data version is changed
+ char buf[kDataVersionSize];
+ EncodeFixed32(buf, data_version + 10); // make sure it's not valid
+ output.replace(0, kDataVersionSize, buf, kDataVersionSize);
+ Status s = CompactionServiceResult::Read(output, &deserialized3);
+ ASSERT_TRUE(s.IsNotSupported());
+ for (const auto& item : status_list) {
+ item.PermitUncheckedError();
+ }
+}
+
+class CompactionJobDynamicFileSizeTest
+ : public CompactionJobTestBase,
+ public ::testing::WithParamInterface<bool> {
+ public:
+ CompactionJobDynamicFileSizeTest()
+ : CompactionJobTestBase(
+ test::PerThreadDBPath("compaction_job_dynamic_file_size_test"),
+ BytewiseComparator(), [](uint64_t /*ts*/) { return ""; },
+ /*test_io_priority=*/false, TableTypeForTest::kMockTable) {}
+};
+
+TEST_P(CompactionJobDynamicFileSizeTest, CutForMaxCompactionBytes) {
+ // dynamic_file_size option should have no impact on cutting for max
+ // compaction bytes.
+ bool enable_dyanmic_file_size = GetParam();
+ cf_options_.level_compaction_dynamic_file_size = enable_dyanmic_file_size;
+
+ NewDB();
+ mutable_cf_options_.target_file_size_base = 80;
+ mutable_cf_options_.max_compaction_bytes = 21;
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("c", 5U, kTypeValue), "val2"},
+ {KeyStr("n", 6U, kTypeValue), "val3"},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({{KeyStr("h", 3U, kTypeValue), "val"},
+ {KeyStr("j", 4U, kTypeValue), "val"}});
+ AddMockFile(file2, 1);
+
+ // Create three L2 files, each size 10.
+ // max_compaction_bytes 21 means the compaction output in L1 will
+ // be cut to at least two files.
+ auto file3 = mock::MakeMockFile({{KeyStr("b", 1U, kTypeValue), "val"},
+ {KeyStr("c", 1U, kTypeValue), "val"},
+ {KeyStr("c1", 1U, kTypeValue), "val"},
+ {KeyStr("c2", 1U, kTypeValue), "val"},
+ {KeyStr("c3", 1U, kTypeValue), "val"},
+ {KeyStr("c4", 1U, kTypeValue), "val"},
+ {KeyStr("d", 1U, kTypeValue), "val"},
+ {KeyStr("e", 2U, kTypeValue), "val"}});
+ AddMockFile(file3, 2);
+
+ auto file4 = mock::MakeMockFile({{KeyStr("h", 1U, kTypeValue), "val"},
+ {KeyStr("i", 1U, kTypeValue), "val"},
+ {KeyStr("i1", 1U, kTypeValue), "val"},
+ {KeyStr("i2", 1U, kTypeValue), "val"},
+ {KeyStr("i3", 1U, kTypeValue), "val"},
+ {KeyStr("i4", 1U, kTypeValue), "val"},
+ {KeyStr("j", 1U, kTypeValue), "val"},
+ {KeyStr("k", 2U, kTypeValue), "val"}});
+ AddMockFile(file4, 2);
+
+ auto file5 = mock::MakeMockFile({{KeyStr("l", 1U, kTypeValue), "val"},
+ {KeyStr("m", 1U, kTypeValue), "val"},
+ {KeyStr("m1", 1U, kTypeValue), "val"},
+ {KeyStr("m2", 1U, kTypeValue), "val"},
+ {KeyStr("m3", 1U, kTypeValue), "val"},
+ {KeyStr("m4", 1U, kTypeValue), "val"},
+ {KeyStr("n", 1U, kTypeValue), "val"},
+ {KeyStr("o", 2U, kTypeValue), "val"}});
+ AddMockFile(file5, 2);
+
+ // The expected output should be:
+ // L1: [c, h, j] [n]
+ // L2: [b ... e] [h ... k] [l ... o]
+ // It's better to have "j" in the first file, because it overlaps with the
+ // second file on L2 anyway.
+ // (Note: before this PR, the output was cut at "h" because the internal
+ // comparator considers L1 "h" with seqno 3 smaller than L2 "h" with seqno 1,
+ // even though the compaction picker treats them as overlapping.)
+
+ auto expected_file1 =
+ mock::MakeMockFile({{KeyStr("c", 5U, kTypeValue), "val2"},
+ {KeyStr("h", 3U, kTypeValue), "val"},
+ {KeyStr("j", 4U, kTypeValue), "val"}});
+ auto expected_file2 =
+ mock::MakeMockFile({{KeyStr("n", 6U, kTypeValue), "val3"}});
+
+ SetLastSequence(6U);
+
+ const std::vector<int> input_levels = {0, 1};
+ auto lvl0_files = cfd_->current()->storage_info()->LevelFiles(0);
+ auto lvl1_files = cfd_->current()->storage_info()->LevelFiles(1);
+
+ RunCompaction({lvl0_files, lvl1_files}, input_levels,
+ {expected_file1, expected_file2});
+}
+
+TEST_P(CompactionJobDynamicFileSizeTest, CutToSkipGrandparentFile) {
+ bool enable_dyanmic_file_size = GetParam();
+ cf_options_.level_compaction_dynamic_file_size = enable_dyanmic_file_size;
+
+ NewDB();
+ // Make sure the grandparent level file size (10) qualifies for skipping.
+ // Currently, it has to be > 1/8 of the target file size.
+ mutable_cf_options_.target_file_size_base = 70;
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("a", 5U, kTypeValue), "val2"},
+ {KeyStr("z", 6U, kTypeValue), "val3"},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({{KeyStr("c", 3U, kTypeValue), "val"},
+ {KeyStr("x", 4U, kTypeValue), "val"}});
+ AddMockFile(file2, 1);
+
+ auto file3 = mock::MakeMockFile({{KeyStr("b", 1U, kTypeValue), "val"},
+ {KeyStr("d", 2U, kTypeValue), "val"}});
+ AddMockFile(file3, 2);
+
+ auto file4 = mock::MakeMockFile({{KeyStr("h", 1U, kTypeValue), "val"},
+ {KeyStr("i", 2U, kTypeValue), "val"}});
+ AddMockFile(file4, 2);
+
+ auto file5 = mock::MakeMockFile({{KeyStr("v", 1U, kTypeValue), "val"},
+ {KeyStr("y", 2U, kTypeValue), "val"}});
+ AddMockFile(file5, 2);
+
+ auto expected_file1 =
+ mock::MakeMockFile({{KeyStr("a", 5U, kTypeValue), "val2"},
+ {KeyStr("c", 3U, kTypeValue), "val"}});
+ auto expected_file2 =
+ mock::MakeMockFile({{KeyStr("x", 4U, kTypeValue), "val"},
+ {KeyStr("z", 6U, kTypeValue), "val3"}});
+
+ auto expected_file_disable_dynamic_file_size =
+ mock::MakeMockFile({{KeyStr("a", 5U, kTypeValue), "val2"},
+ {KeyStr("c", 3U, kTypeValue), "val"},
+ {KeyStr("x", 4U, kTypeValue), "val"},
+ {KeyStr("z", 6U, kTypeValue), "val3"}});
+
+ SetLastSequence(6U);
+ const std::vector<int> input_levels = {0, 1};
+ auto lvl0_files = cfd_->current()->storage_info()->LevelFiles(0);
+ auto lvl1_files = cfd_->current()->storage_info()->LevelFiles(1);
+ if (enable_dyanmic_file_size) {
+ RunCompaction({lvl0_files, lvl1_files}, input_levels,
+ {expected_file1, expected_file2});
+ } else {
+ RunCompaction({lvl0_files, lvl1_files}, input_levels,
+ {expected_file_disable_dynamic_file_size});
+ }
+}
+
+TEST_P(CompactionJobDynamicFileSizeTest, CutToAlignGrandparentBoundary) {
+ bool enable_dyanmic_file_size = GetParam();
+ cf_options_.level_compaction_dynamic_file_size = enable_dyanmic_file_size;
+ NewDB();
+
+ // MockTable has 1 byte per entry by default, so each file is 10 bytes.
+ // When the file size is smaller than 100, the compaction won't cut the file
+ // early to align with its grandparent boundary.
+ const size_t kKeyValueSize = 10000;
+ mock_table_factory_->SetKeyValueSize(kKeyValueSize);
+
+ mutable_cf_options_.target_file_size_base = 10 * kKeyValueSize;
+
+ mock::KVVector file1;
+ char ch = 'd';
+  // Add values for keys d through o
+ for (char i = 0; i < 12; i++) {
+ file1.emplace_back(KeyStr(std::string(1, ch + i), i + 10, kTypeValue),
+ "val" + std::to_string(i));
+ }
+
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({{KeyStr("e", 3U, kTypeValue), "val"},
+ {KeyStr("s", 4U, kTypeValue), "val"}});
+ AddMockFile(file2, 1);
+
+ // the 1st grandparent file should be skipped
+ auto file3 = mock::MakeMockFile({{KeyStr("a", 1U, kTypeValue), "val"},
+ {KeyStr("b", 2U, kTypeValue), "val"}});
+ AddMockFile(file3, 2);
+
+ auto file4 = mock::MakeMockFile({{KeyStr("c", 1U, kTypeValue), "val"},
+ {KeyStr("e", 2U, kTypeValue), "val"}});
+ AddMockFile(file4, 2);
+
+ auto file5 = mock::MakeMockFile({{KeyStr("h", 1U, kTypeValue), "val"},
+ {KeyStr("j", 2U, kTypeValue), "val"}});
+ AddMockFile(file5, 2);
+
+ auto file6 = mock::MakeMockFile({{KeyStr("k", 1U, kTypeValue), "val"},
+ {KeyStr("n", 2U, kTypeValue), "val"}});
+ AddMockFile(file6, 2);
+
+ auto file7 = mock::MakeMockFile({{KeyStr("q", 1U, kTypeValue), "val"},
+ {KeyStr("t", 2U, kTypeValue), "val"}});
+ AddMockFile(file7, 2);
+
+ // The expected outputs are:
+ // L1: [d,e,f,g,h,i,j] [k,l,m,n,o,s]
+ // L2: [a, b] [c, e] [h, j] [k, n] [q, t]
+  // The first output is cut early at "j" so it aligns with the L2 files.
+  // If dynamic_file_size is not enabled, the output is cut based on
+  // target_file_size.
+ mock::KVVector expected_file1;
+ for (char i = 0; i < 7; i++) {
+ expected_file1.emplace_back(
+ KeyStr(std::string(1, ch + i), i + 10, kTypeValue),
+ "val" + std::to_string(i));
+ }
+
+ mock::KVVector expected_file2;
+ for (char i = 7; i < 12; i++) {
+ expected_file2.emplace_back(
+ KeyStr(std::string(1, ch + i), i + 10, kTypeValue),
+ "val" + std::to_string(i));
+ }
+ expected_file2.emplace_back(KeyStr("s", 4U, kTypeValue), "val");
+
+ mock::KVVector expected_file_disable_dynamic_file_size1;
+ for (char i = 0; i < 10; i++) {
+ expected_file_disable_dynamic_file_size1.emplace_back(
+ KeyStr(std::string(1, ch + i), i + 10, kTypeValue),
+ "val" + std::to_string(i));
+ }
+
+ mock::KVVector expected_file_disable_dynamic_file_size2;
+ for (char i = 10; i < 12; i++) {
+ expected_file_disable_dynamic_file_size2.emplace_back(
+ KeyStr(std::string(1, ch + i), i + 10, kTypeValue),
+ "val" + std::to_string(i));
+ }
+
+ expected_file_disable_dynamic_file_size2.emplace_back(
+ KeyStr("s", 4U, kTypeValue), "val");
+
+ SetLastSequence(22U);
+ const std::vector<int> input_levels = {0, 1};
+ auto lvl0_files = cfd_->current()->storage_info()->LevelFiles(0);
+ auto lvl1_files = cfd_->current()->storage_info()->LevelFiles(1);
+  if (enable_dynamic_file_size) {
+ RunCompaction({lvl0_files, lvl1_files}, input_levels,
+ {expected_file1, expected_file2});
+ } else {
+ RunCompaction({lvl0_files, lvl1_files}, input_levels,
+ {expected_file_disable_dynamic_file_size1,
+ expected_file_disable_dynamic_file_size2});
+ }
+}
+
+TEST_P(CompactionJobDynamicFileSizeTest, CutToAlignGrandparentBoundarySameKey) {
+  bool enable_dynamic_file_size = GetParam();
+  cf_options_.level_compaction_dynamic_file_size = enable_dynamic_file_size;
+ NewDB();
+
+  // MockTable has 1 byte per entry by default, and each file is 10 bytes.
+  // When the file size is smaller than 100, the file won't be cut early to
+  // align with its grandparent boundary.
+ const size_t kKeyValueSize = 10000;
+ mock_table_factory_->SetKeyValueSize(kKeyValueSize);
+
+ mutable_cf_options_.target_file_size_base = 10 * kKeyValueSize;
+
+ mock::KVVector file1;
+ for (int i = 0; i < 7; i++) {
+ file1.emplace_back(KeyStr("a", 100 - i, kTypeValue),
+ "val" + std::to_string(100 - i));
+ }
+ file1.emplace_back(KeyStr("b", 90, kTypeValue), "valb");
+
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({{KeyStr("a", 93U, kTypeValue), "val93"},
+ {KeyStr("b", 90U, kTypeValue), "valb"}});
+ AddMockFile(file2, 1);
+
+ auto file3 = mock::MakeMockFile({{KeyStr("a", 89U, kTypeValue), "val"},
+ {KeyStr("a", 88U, kTypeValue), "val"}});
+ AddMockFile(file3, 2);
+
+ auto file4 = mock::MakeMockFile({{KeyStr("a", 87U, kTypeValue), "val"},
+ {KeyStr("a", 86U, kTypeValue), "val"}});
+ AddMockFile(file4, 2);
+
+ auto file5 = mock::MakeMockFile({{KeyStr("b", 85U, kTypeValue), "val"},
+ {KeyStr("b", 84U, kTypeValue), "val"}});
+ AddMockFile(file5, 2);
+
+ mock::KVVector expected_file1;
+ mock::KVVector expected_file_disable_dynamic_file_size;
+
+ for (int i = 0; i < 8; i++) {
+ expected_file1.emplace_back(KeyStr("a", 100 - i, kTypeValue),
+ "val" + std::to_string(100 - i));
+ expected_file_disable_dynamic_file_size.emplace_back(
+ KeyStr("a", 100 - i, kTypeValue), "val" + std::to_string(100 - i));
+ }
+
+  // Make sure `b` is cut into a separate file (so internally it's not using
+  // the internal comparator, which would consider "b:90" (seqno 90) here
+  // smaller than "b:85" on L2.)
+ auto expected_file2 =
+ mock::MakeMockFile({{KeyStr("b", 90U, kTypeValue), "valb"}});
+
+ expected_file_disable_dynamic_file_size.emplace_back(
+ KeyStr("b", 90U, kTypeValue), "valb");
+
+ SetLastSequence(122U);
+ const std::vector<int> input_levels = {0, 1};
+ auto lvl0_files = cfd_->current()->storage_info()->LevelFiles(0);
+ auto lvl1_files = cfd_->current()->storage_info()->LevelFiles(1);
+
+ // Just keep all the history
+ std::vector<SequenceNumber> snapshots;
+ for (int i = 80; i <= 100; i++) {
+ snapshots.emplace_back(i);
+ }
+  if (enable_dynamic_file_size) {
+ RunCompaction({lvl0_files, lvl1_files}, input_levels,
+ {expected_file1, expected_file2}, snapshots);
+ } else {
+ RunCompaction({lvl0_files, lvl1_files}, input_levels,
+ {expected_file_disable_dynamic_file_size}, snapshots);
+ }
+}
+
+TEST_P(CompactionJobDynamicFileSizeTest, CutForMaxCompactionBytesSameKey) {
+ // dynamic_file_size option should have no impact on cutting for max
+ // compaction bytes.
+  bool enable_dynamic_file_size = GetParam();
+  cf_options_.level_compaction_dynamic_file_size = enable_dynamic_file_size;
+
+ NewDB();
+ mutable_cf_options_.target_file_size_base = 80;
+ mutable_cf_options_.max_compaction_bytes = 20;
+
+ auto file1 = mock::MakeMockFile({{KeyStr("a", 104U, kTypeValue), "val1"},
+ {KeyStr("b", 103U, kTypeValue), "val"}});
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({{KeyStr("a", 102U, kTypeValue), "val2"},
+ {KeyStr("c", 101U, kTypeValue), "val"}});
+ AddMockFile(file2, 1);
+
+ for (int i = 0; i < 10; i++) {
+ auto file =
+ mock::MakeMockFile({{KeyStr("a", 100 - (i * 2), kTypeValue), "val"},
+ {KeyStr("a", 99 - (i * 2), kTypeValue), "val"}});
+ AddMockFile(file, 2);
+ }
+
+ for (int i = 0; i < 10; i++) {
+ auto file =
+ mock::MakeMockFile({{KeyStr("b", 80 - (i * 2), kTypeValue), "val"},
+ {KeyStr("b", 79 - (i * 2), kTypeValue), "val"}});
+ AddMockFile(file, 2);
+ }
+
+  auto file5 = mock::MakeMockFile({{KeyStr("c", 60U, kTypeValue), "valc"},
+                                   {KeyStr("c", 59U, kTypeValue), "valc"}});
+  AddMockFile(file5, 2);
+
+ // "a" has 10 overlapped grandparent files (each size 10), which is far
+ // exceeded the `max_compaction_bytes`, but make sure 2 "a" are not separated,
+ // as splitting them won't help reducing the compaction size.
+ // also make sure "b" and "c" are cut separately.
+ mock::KVVector expected_file1 =
+ mock::MakeMockFile({{KeyStr("a", 104U, kTypeValue), "val1"},
+ {KeyStr("a", 102U, kTypeValue), "val2"}});
+ mock::KVVector expected_file2 =
+ mock::MakeMockFile({{KeyStr("b", 103U, kTypeValue), "val"}});
+ mock::KVVector expected_file3 =
+ mock::MakeMockFile({{KeyStr("c", 101U, kTypeValue), "val"}});
+
+ SetLastSequence(122U);
+ const std::vector<int> input_levels = {0, 1};
+ auto lvl0_files = cfd_->current()->storage_info()->LevelFiles(0);
+ auto lvl1_files = cfd_->current()->storage_info()->LevelFiles(1);
+
+ // Just keep all the history
+ std::vector<SequenceNumber> snapshots;
+ for (int i = 80; i <= 105; i++) {
+ snapshots.emplace_back(i);
+ }
+ RunCompaction({lvl0_files, lvl1_files}, input_levels,
+ {expected_file1, expected_file2, expected_file3}, snapshots);
+}
+
+INSTANTIATE_TEST_CASE_P(CompactionJobDynamicFileSizeTest,
+ CompactionJobDynamicFileSizeTest, testing::Bool());
+
+class CompactionJobTimestampTest : public CompactionJobTestBase {
+ public:
+ CompactionJobTimestampTest()
+ : CompactionJobTestBase(test::PerThreadDBPath("compaction_job_ts_test"),
+ test::BytewiseComparatorWithU64TsWrapper(),
+ test::EncodeInt, /*test_io_priority=*/false,
+ TableTypeForTest::kMockTable) {}
+};
+
+TEST_F(CompactionJobTimestampTest, GCDisabled) {
+ NewDB();
+
+ auto file1 =
+ mock::MakeMockFile({{KeyStr("a", 10, ValueType::kTypeValue, 100), "a10"},
+ {KeyStr("a", 9, ValueType::kTypeValue, 99), "a9"},
+ {KeyStr("b", 8, ValueType::kTypeValue, 98), "b8"},
+ {KeyStr("d", 7, ValueType::kTypeValue, 97), "d7"}});
+
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile(
+ {{KeyStr("b", 6, ValueType::kTypeDeletionWithTimestamp, 96), ""},
+ {KeyStr("c", 5, ValueType::kTypeDeletionWithTimestamp, 95), ""},
+ {KeyStr("c", 4, ValueType::kTypeValue, 94), "c5"},
+ {KeyStr("d", 3, ValueType::kTypeSingleDeletion, 93), ""}});
+ AddMockFile(file2);
+
+ SetLastSequence(10);
+
+ auto expected_results = mock::MakeMockFile(
+ {{KeyStr("a", 10, ValueType::kTypeValue, 100), "a10"},
+ {KeyStr("a", 9, ValueType::kTypeValue, 99), "a9"},
+ {KeyStr("b", 8, ValueType::kTypeValue, 98), "b8"},
+ {KeyStr("b", 6, ValueType::kTypeDeletionWithTimestamp, 96), ""},
+ {KeyStr("c", 5, ValueType::kTypeDeletionWithTimestamp, 95), ""},
+ {KeyStr("c", 4, ValueType::kTypeValue, 94), "c5"},
+ {KeyStr("d", 7, ValueType::kTypeValue, 97), "d7"},
+ {KeyStr("d", 3, ValueType::kTypeSingleDeletion, 93), ""}});
+ constexpr int input_level = 0;
+ const auto& files = cfd_->current()->storage_info()->LevelFiles(input_level);
+ RunCompaction({files}, {input_level}, {expected_results});
+}
+
+TEST_F(CompactionJobTimestampTest, NoKeyExpired) {
+ NewDB();
+
+ auto file1 =
+ mock::MakeMockFile({{KeyStr("a", 6, ValueType::kTypeValue, 100), "a6"},
+ {KeyStr("b", 7, ValueType::kTypeValue, 101), "b7"},
+ {KeyStr("c", 5, ValueType::kTypeValue, 99), "c5"}});
+ AddMockFile(file1);
+
+ auto file2 =
+ mock::MakeMockFile({{KeyStr("a", 4, ValueType::kTypeValue, 98), "a4"},
+ {KeyStr("c", 3, ValueType::kTypeValue, 97), "c3"}});
+ AddMockFile(file2);
+
+ SetLastSequence(101);
+
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("a", 6, ValueType::kTypeValue, 100), "a6"},
+ {KeyStr("a", 4, ValueType::kTypeValue, 98), "a4"},
+ {KeyStr("b", 7, ValueType::kTypeValue, 101), "b7"},
+ {KeyStr("c", 5, ValueType::kTypeValue, 99), "c5"},
+ {KeyStr("c", 3, ValueType::kTypeValue, 97), "c3"}});
+ constexpr int input_level = 0;
+ const auto& files = cfd_->current()->storage_info()->LevelFiles(input_level);
+
+ full_history_ts_low_ = encode_u64_ts_(0);
+ RunCompaction({files}, {input_level}, {expected_results});
+}
+
+TEST_F(CompactionJobTimestampTest, AllKeysExpired) {
+ NewDB();
+
+ auto file1 = mock::MakeMockFile(
+ {{KeyStr("a", 5, ValueType::kTypeDeletionWithTimestamp, 100), ""},
+ {KeyStr("b", 6, ValueType::kTypeSingleDeletion, 99), ""},
+ {KeyStr("c", 7, ValueType::kTypeValue, 98), "c7"}});
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile(
+ {{KeyStr("a", 4, ValueType::kTypeValue, 97), "a4"},
+ {KeyStr("b", 3, ValueType::kTypeValue, 96), "b3"},
+ {KeyStr("c", 2, ValueType::kTypeDeletionWithTimestamp, 95), ""},
+ {KeyStr("c", 1, ValueType::kTypeValue, 94), "c1"}});
+ AddMockFile(file2);
+
+ SetLastSequence(7);
+
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("c", 0, ValueType::kTypeValue, 0), "c7"}});
+ constexpr int input_level = 0;
+ const auto& files = cfd_->current()->storage_info()->LevelFiles(input_level);
+
+ full_history_ts_low_ = encode_u64_ts_(std::numeric_limits<uint64_t>::max());
+ RunCompaction({files}, {input_level}, {expected_results});
+}
+
+TEST_F(CompactionJobTimestampTest, SomeKeysExpired) {
+ NewDB();
+
+ auto file1 =
+ mock::MakeMockFile({{KeyStr("a", 5, ValueType::kTypeValue, 50), "a5"},
+ {KeyStr("b", 6, ValueType::kTypeValue, 49), "b6"}});
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile(
+ {{KeyStr("a", 3, ValueType::kTypeValue, 48), "a3"},
+ {KeyStr("a", 2, ValueType::kTypeValue, 46), "a2"},
+ {KeyStr("b", 4, ValueType::kTypeDeletionWithTimestamp, 47), ""}});
+ AddMockFile(file2);
+
+ SetLastSequence(6);
+
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("a", 5, ValueType::kTypeValue, 50), "a5"},
+ {KeyStr("a", 0, ValueType::kTypeValue, 0), "a3"},
+ {KeyStr("b", 6, ValueType::kTypeValue, 49), "b6"}});
+ constexpr int input_level = 0;
+ const auto& files = cfd_->current()->storage_info()->LevelFiles(input_level);
+
+ full_history_ts_low_ = encode_u64_ts_(49);
+ RunCompaction({files}, {input_level}, {expected_results});
+}
+
+class CompactionJobTimestampTestWithBbTable : public CompactionJobTestBase {
+ public:
+ // Block-based table is needed if we want to test subcompaction partitioning
+ // with anchors.
+ explicit CompactionJobTimestampTestWithBbTable()
+ : CompactionJobTestBase(
+ test::PerThreadDBPath("compaction_job_ts_bbt_test"),
+ test::BytewiseComparatorWithU64TsWrapper(), test::EncodeInt,
+ /*test_io_priority=*/false, TableTypeForTest::kBlockBasedTable) {}
+};
+
+TEST_F(CompactionJobTimestampTestWithBbTable, SubcompactionAnchorL1) {
+ cf_options_.target_file_size_base = 20;
+ mutable_cf_options_.target_file_size_base = 20;
+ NewDB();
+
+ const std::vector<std::string> keys = {
+ KeyStr("a", 20, ValueType::kTypeValue, 200),
+ KeyStr("b", 21, ValueType::kTypeValue, 210),
+ KeyStr("b", 20, ValueType::kTypeValue, 200),
+ KeyStr("b", 18, ValueType::kTypeValue, 180),
+ KeyStr("c", 17, ValueType::kTypeValue, 170),
+ KeyStr("c", 16, ValueType::kTypeValue, 160),
+ KeyStr("c", 15, ValueType::kTypeValue, 150)};
+ const std::vector<std::string> values = {"a20", "b21", "b20", "b18",
+ "c17", "c16", "c15"};
+
+ constexpr int input_level = 1;
+
+ auto file1 = mock::MakeMockFile(
+ {{keys[0], values[0]}, {keys[1], values[1]}, {keys[2], values[2]}});
+ AddMockFile(file1, input_level);
+
+ auto file2 = mock::MakeMockFile(
+ {{keys[3], values[3]}, {keys[4], values[4]}, {keys[5], values[5]}});
+ AddMockFile(file2, input_level);
+
+ auto file3 = mock::MakeMockFile({{keys[6], values[6]}});
+ AddMockFile(file3, input_level);
+
+ SetLastSequence(20);
+
+ auto output1 = mock::MakeMockFile({{keys[0], values[0]}});
+ auto output2 = mock::MakeMockFile(
+ {{keys[1], values[1]}, {keys[2], values[2]}, {keys[3], values[3]}});
+ auto output3 = mock::MakeMockFile(
+ {{keys[4], values[4]}, {keys[5], values[5]}, {keys[6], values[6]}});
+
+ auto expected_results =
+ std::vector<mock::KVVector>{output1, output2, output3};
+ const auto& files = cfd_->current()->storage_info()->LevelFiles(input_level);
+
+ constexpr int output_level = 2;
+ constexpr int max_subcompactions = 4;
+ RunCompaction({files}, {input_level}, expected_results, /*snapshots=*/{},
+ /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+ output_level, /*verify=*/true, {kInvalidBlobFileNumber},
+ /*check_get_priority=*/false, Env::IO_TOTAL, Env::IO_TOTAL,
+ max_subcompactions);
+}
+
+TEST_F(CompactionJobTimestampTestWithBbTable, SubcompactionL0) {
+ cf_options_.target_file_size_base = 20;
+ mutable_cf_options_.target_file_size_base = 20;
+ NewDB();
+
+ const std::vector<std::string> keys = {
+ KeyStr("a", 20, ValueType::kTypeValue, 200),
+ KeyStr("b", 20, ValueType::kTypeValue, 200),
+ KeyStr("b", 19, ValueType::kTypeValue, 190),
+ KeyStr("b", 18, ValueType::kTypeValue, 180),
+ KeyStr("c", 17, ValueType::kTypeValue, 170),
+ KeyStr("c", 16, ValueType::kTypeValue, 160),
+ KeyStr("c", 15, ValueType::kTypeValue, 150)};
+ const std::vector<std::string> values = {"a20", "b20", "b19", "b18",
+ "c17", "c16", "c15"};
+
+ constexpr int input_level = 0;
+
+ auto file1 = mock::MakeMockFile({{keys[5], values[5]}, {keys[6], values[6]}});
+ AddMockFile(file1, input_level);
+
+ auto file2 = mock::MakeMockFile({{keys[3], values[3]}, {keys[4], values[4]}});
+ AddMockFile(file2, input_level);
+
+ auto file3 = mock::MakeMockFile(
+ {{keys[0], values[0]}, {keys[1], values[1]}, {keys[2], values[2]}});
+ AddMockFile(file3, input_level);
+
+ SetLastSequence(20);
+
+ auto output1 = mock::MakeMockFile({{keys[0], values[0]}});
+ auto output2 = mock::MakeMockFile(
+ {{keys[1], values[1]}, {keys[2], values[2]}, {keys[3], values[3]}});
+ auto output3 = mock::MakeMockFile(
+ {{keys[4], values[4]}, {keys[5], values[5]}, {keys[6], values[6]}});
+
+ auto expected_results =
+ std::vector<mock::KVVector>{output1, output2, output3};
+ const auto& files = cfd_->current()->storage_info()->LevelFiles(input_level);
+
+ constexpr int output_level = 1;
+ constexpr int max_subcompactions = 4;
+ RunCompaction({files}, {input_level}, expected_results, /*snapshots=*/{},
+ /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+ output_level, /*verify=*/true, {kInvalidBlobFileNumber},
+ /*check_get_priority=*/false, Env::IO_TOTAL, Env::IO_TOTAL,
+ max_subcompactions);
+}
+
+// The io priority of the compaction reads and writes is different from that
+// of other DB reads and writes. To prepare the compaction input files, use
+// the default filesystem from Env. To test the io priority of the compaction
+// reads and writes, db_options_.fs is set to MockTestFileSystem.
+class CompactionJobIOPriorityTest : public CompactionJobTestBase {
+ public:
+ CompactionJobIOPriorityTest()
+ : CompactionJobTestBase(
+ test::PerThreadDBPath("compaction_job_io_priority_test"),
+ BytewiseComparator(), [](uint64_t /*ts*/) { return ""; },
+ /*test_io_priority=*/true, TableTypeForTest::kBlockBasedTable) {}
+};
+
+TEST_F(CompactionJobIOPriorityTest, WriteControllerStateNormal) {
+ // When the state from WriteController is normal.
+ NewDB();
+ mock::KVVector expected_results = CreateTwoFiles(false);
+ auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+ constexpr int input_level = 0;
+ auto files = cfd->current()->storage_info()->LevelFiles(input_level);
+ ASSERT_EQ(2U, files.size());
+ RunCompaction({files}, {input_level}, {expected_results}, {},
+ kMaxSequenceNumber, 1, false, {kInvalidBlobFileNumber}, false,
+ Env::IO_LOW, Env::IO_LOW);
+}
+
+TEST_F(CompactionJobIOPriorityTest, WriteControllerStateDelayed) {
+ // When the state from WriteController is Delayed.
+ NewDB();
+ mock::KVVector expected_results = CreateTwoFiles(false);
+ auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+ constexpr int input_level = 0;
+ auto files = cfd->current()->storage_info()->LevelFiles(input_level);
+ ASSERT_EQ(2U, files.size());
+ {
+ std::unique_ptr<WriteControllerToken> delay_token =
+ write_controller_.GetDelayToken(1000000);
+ RunCompaction({files}, {input_level}, {expected_results}, {},
+ kMaxSequenceNumber, 1, false, {kInvalidBlobFileNumber}, false,
+ Env::IO_USER, Env::IO_USER);
+ }
+}
+
+TEST_F(CompactionJobIOPriorityTest, WriteControllerStateStalled) {
+ // When the state from WriteController is Stalled.
+ NewDB();
+ mock::KVVector expected_results = CreateTwoFiles(false);
+ auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+ constexpr int input_level = 0;
+ auto files = cfd->current()->storage_info()->LevelFiles(input_level);
+ ASSERT_EQ(2U, files.size());
+ {
+ std::unique_ptr<WriteControllerToken> stop_token =
+ write_controller_.GetStopToken();
+ RunCompaction({files}, {input_level}, {expected_results}, {},
+ kMaxSequenceNumber, 1, false, {kInvalidBlobFileNumber}, false,
+ Env::IO_USER, Env::IO_USER);
+ }
+}
+
+TEST_F(CompactionJobIOPriorityTest, GetRateLimiterPriority) {
+ NewDB();
+ mock::KVVector expected_results = CreateTwoFiles(false);
+ auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+ constexpr int input_level = 0;
+ auto files = cfd->current()->storage_info()->LevelFiles(input_level);
+ ASSERT_EQ(2U, files.size());
+ RunCompaction({files}, {input_level}, {expected_results}, {},
+ kMaxSequenceNumber, 1, false, {kInvalidBlobFileNumber}, true,
+ Env::IO_LOW, Env::IO_LOW);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr,
+ "SKIPPED as CompactionJobStats is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/db/compaction/compaction_outputs.cc b/src/rocksdb/db/compaction/compaction_outputs.cc
new file mode 100644
index 000000000..e74378e2a
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_outputs.cc
@@ -0,0 +1,646 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction_outputs.h"
+
+#include "db/builder.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void CompactionOutputs::NewBuilder(const TableBuilderOptions& tboptions) {
+ builder_.reset(NewTableBuilder(tboptions, file_writer_.get()));
+}
+
+Status CompactionOutputs::Finish(const Status& input_status,
+ const SeqnoToTimeMapping& seqno_time_mapping) {
+ FileMetaData* meta = GetMetaData();
+ assert(meta != nullptr);
+  Status s = input_status;
+ if (s.ok()) {
+ std::string seqno_time_mapping_str;
+ seqno_time_mapping.Encode(seqno_time_mapping_str, meta->fd.smallest_seqno,
+ meta->fd.largest_seqno, meta->file_creation_time);
+ builder_->SetSeqnoTimeTableProperties(seqno_time_mapping_str,
+ meta->oldest_ancester_time);
+ s = builder_->Finish();
+
+ } else {
+ builder_->Abandon();
+ }
+ Status io_s = builder_->io_status();
+ if (s.ok()) {
+ s = io_s;
+ } else {
+ io_s.PermitUncheckedError();
+ }
+ const uint64_t current_bytes = builder_->FileSize();
+ if (s.ok()) {
+ meta->fd.file_size = current_bytes;
+ meta->marked_for_compaction = builder_->NeedCompact();
+ }
+ current_output().finished = true;
+ stats_.bytes_written += current_bytes;
+ stats_.num_output_files = outputs_.size();
+
+ return s;
+}
+
+IOStatus CompactionOutputs::WriterSyncClose(const Status& input_status,
+ SystemClock* clock,
+ Statistics* statistics,
+ bool use_fsync) {
+ IOStatus io_s;
+ if (input_status.ok()) {
+ StopWatch sw(clock, statistics, COMPACTION_OUTFILE_SYNC_MICROS);
+ io_s = file_writer_->Sync(use_fsync);
+ }
+ if (input_status.ok() && io_s.ok()) {
+ io_s = file_writer_->Close();
+ }
+
+ if (input_status.ok() && io_s.ok()) {
+ FileMetaData* meta = GetMetaData();
+ meta->file_checksum = file_writer_->GetFileChecksum();
+ meta->file_checksum_func_name = file_writer_->GetFileChecksumFuncName();
+ }
+
+ file_writer_.reset();
+
+ return io_s;
+}
+
+size_t CompactionOutputs::UpdateGrandparentBoundaryInfo(
+ const Slice& internal_key) {
+ size_t curr_key_boundary_switched_num = 0;
+ const std::vector<FileMetaData*>& grandparents = compaction_->grandparents();
+
+ if (grandparents.empty()) {
+ return curr_key_boundary_switched_num;
+ }
+ assert(!internal_key.empty());
+ InternalKey ikey;
+ ikey.DecodeFrom(internal_key);
+ assert(ikey.Valid());
+
+ const Comparator* ucmp = compaction_->column_family_data()->user_comparator();
+
+ // Move the grandparent_index_ to the file containing the current user_key.
+ // If there are multiple files containing the same user_key, make sure the
+ // index points to the last file containing the key.
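+  // For example (illustrative): with grandparent files [a, c] and [e, g],
+  // user key "d" leaves the index in the gap before [e, g], while user key
+  // "f" leaves it inside [e, g].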
+ while (grandparent_index_ < grandparents.size()) {
+ if (being_grandparent_gap_) {
+ if (sstableKeyCompare(ucmp, ikey,
+ grandparents[grandparent_index_]->smallest) < 0) {
+ break;
+ }
+ if (seen_key_) {
+ curr_key_boundary_switched_num++;
+ grandparent_overlapped_bytes_ +=
+ grandparents[grandparent_index_]->fd.GetFileSize();
+ grandparent_boundary_switched_num_++;
+ }
+ being_grandparent_gap_ = false;
+ } else {
+ int cmp_result = sstableKeyCompare(
+ ucmp, ikey, grandparents[grandparent_index_]->largest);
+      // If it's the same key, make sure grandparent_index_ is pointing to
+      // the last one.
+ if (cmp_result < 0 ||
+ (cmp_result == 0 &&
+ (grandparent_index_ == grandparents.size() - 1 ||
+ sstableKeyCompare(ucmp, ikey,
+ grandparents[grandparent_index_ + 1]->smallest) <
+ 0))) {
+ break;
+ }
+ if (seen_key_) {
+ curr_key_boundary_switched_num++;
+ grandparent_boundary_switched_num_++;
+ }
+ being_grandparent_gap_ = true;
+ grandparent_index_++;
+ }
+ }
+
+  // If the first key is in the middle of a grandparent file, add it to the
+  // overlap.
+ if (!seen_key_ && !being_grandparent_gap_) {
+ assert(grandparent_overlapped_bytes_ == 0);
+ grandparent_overlapped_bytes_ =
+ GetCurrentKeyGrandparentOverlappedBytes(internal_key);
+ }
+
+ seen_key_ = true;
+ return curr_key_boundary_switched_num;
+}
+
+uint64_t CompactionOutputs::GetCurrentKeyGrandparentOverlappedBytes(
+ const Slice& internal_key) const {
+ // no overlap with any grandparent file
+ if (being_grandparent_gap_) {
+ return 0;
+ }
+ uint64_t overlapped_bytes = 0;
+
+ const std::vector<FileMetaData*>& grandparents = compaction_->grandparents();
+ const Comparator* ucmp = compaction_->column_family_data()->user_comparator();
+ InternalKey ikey;
+ ikey.DecodeFrom(internal_key);
+#ifndef NDEBUG
+  // make sure grandparent_index_ is pointing to the last file containing
+  // the current key.
+ int cmp_result =
+ sstableKeyCompare(ucmp, ikey, grandparents[grandparent_index_]->largest);
+ assert(
+ cmp_result < 0 ||
+ (cmp_result == 0 &&
+ (grandparent_index_ == grandparents.size() - 1 ||
+ sstableKeyCompare(
+ ucmp, ikey, grandparents[grandparent_index_ + 1]->smallest) < 0)));
+ assert(sstableKeyCompare(ucmp, ikey,
+ grandparents[grandparent_index_]->smallest) >= 0);
+#endif
+ overlapped_bytes += grandparents[grandparent_index_]->fd.GetFileSize();
+
+  // Go backwards to find all overlapped files; one key can overlap multiple
+  // files. In the following example, if the current output key is `c`, and
+  // one compaction file was cut before `c`, the current `c` can overlap with
+  // 3 files:
+ // [a b] [c...
+ // [b, b] [c, c] [c, c] [c, d]
+ for (int64_t i = static_cast<int64_t>(grandparent_index_) - 1;
+ i >= 0 && sstableKeyCompare(ucmp, ikey, grandparents[i]->largest) == 0;
+ i--) {
+ overlapped_bytes += grandparents[i]->fd.GetFileSize();
+ }
+
+ return overlapped_bytes;
+}
+
+bool CompactionOutputs::ShouldStopBefore(const CompactionIterator& c_iter) {
+ assert(c_iter.Valid());
+
+ // always update grandparent information like overlapped file number, size
+ // etc.
+ const Slice& internal_key = c_iter.key();
+ const uint64_t previous_overlapped_bytes = grandparent_overlapped_bytes_;
+ size_t num_grandparent_boundaries_crossed =
+ UpdateGrandparentBoundaryInfo(internal_key);
+
+ if (!HasBuilder()) {
+ return false;
+ }
+
+  // If there's a user-defined partitioner, check that first
+ if (partitioner_ && partitioner_->ShouldPartition(PartitionerRequest(
+ last_key_for_partitioner_, c_iter.user_key(),
+ current_output_file_size_)) == kRequired) {
+ return true;
+ }
+
+ // files output to Level 0 won't be split
+ if (compaction_->output_level() == 0) {
+ return false;
+ }
+
+ // reach the max file size
+ if (current_output_file_size_ >= compaction_->max_output_file_size()) {
+ return true;
+ }
+
+ const InternalKeyComparator* icmp =
+ &compaction_->column_family_data()->internal_comparator();
+
+ // Check if it needs to split for RoundRobin
+ // Invalid local_output_split_key indicates that we do not need to split
+ if (local_output_split_key_ != nullptr && !is_split_) {
+ // Split occurs when the next key is larger than/equal to the cursor
+ if (icmp->Compare(internal_key, local_output_split_key_->Encode()) >= 0) {
+ is_split_ = true;
+ return true;
+ }
+ }
+
+  // only check if the current key is going to cross a grandparent file
+  // boundary (either the file beginning or ending).
+ if (num_grandparent_boundaries_crossed > 0) {
+    // Cut the file before the current key if the size of the current output
+    // file plus its overlapped grandparent files is bigger than
+    // max_compaction_bytes. This prevents a future compaction from the
+    // current output level from exceeding max_compaction_bytes.
+ if (grandparent_overlapped_bytes_ + current_output_file_size_ >
+ compaction_->max_compaction_bytes()) {
+ return true;
+ }
+
+ // Cut the file if including the key is going to add a skippable file on
+ // the grandparent level AND its size is reasonably big (1/8 of target file
+ // size). For example, if it's compacting the files L0 + L1:
+ // L0: [1, 21]
+ // L1: [3, 23]
+ // L2: [2, 4] [11, 15] [22, 24]
+ // Without this break, it will output as:
+ // L1: [1,3, 21,23]
+ // With this break, it will output as (assuming [11, 15] at L2 is bigger
+ // than 1/8 of target size):
+ // L1: [1,3] [21,23]
+ // Then for the future compactions, [11,15] won't be included.
+    // For random datasets (either evenly distributed or skewed), this
+    // condition is rarely triggered, but if the user is adding 2 different
+    // datasets without any overlap, it is likely to happen.
+    // For more details, see PR #1963.
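+    // Ending up in a gap after crossing at least 2 boundaries (a file start
+    // plus a file end), or inside a file after crossing at least 3, implies
+    // that at least one whole grandparent file lies between the previous
+    // output key and the current key.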
+ const size_t num_skippable_boundaries_crossed =
+ being_grandparent_gap_ ? 2 : 3;
+ if (compaction_->immutable_options()->compaction_style ==
+ kCompactionStyleLevel &&
+ compaction_->immutable_options()->level_compaction_dynamic_file_size &&
+ num_grandparent_boundaries_crossed >=
+ num_skippable_boundaries_crossed &&
+ grandparent_overlapped_bytes_ - previous_overlapped_bytes >
+ compaction_->target_output_file_size() / 8) {
+ return true;
+ }
+
+    // Pre-cut the output file if it's reaching a certain size AND it's at the
+    // boundary of a grandparent file. This can reduce the future compaction
+    // size, at the cost of having smaller files.
+    // The pre-cut size threshold is based on how many grandparent boundaries
+    // have been seen so far. If no boundary has been seen at all, the file is
+    // pre-cut at 50% of the target file size. Every boundary seen increases
+    // the threshold by 5%, capped at 90%, at which point it always cuts.
+    // The idea is that if more boundaries have been seen before, it is more
+    // likely that another boundary (file cutting opportunity) will show up
+    // before the target file size is reached. Tests show this can generate
+    // larger files than a static threshold like 75%, with a similar write
+    // amplification improvement.
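+    // For example (illustrative): after seeing 3 boundaries, the cut
+    // threshold is 50% + 3 * 5% = 65% of the target file size; from 8
+    // boundaries onwards it is capped at 90%.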
+ if (compaction_->immutable_options()->compaction_style ==
+ kCompactionStyleLevel &&
+ compaction_->immutable_options()->level_compaction_dynamic_file_size &&
+ current_output_file_size_ >=
+ ((compaction_->target_output_file_size() + 99) / 100) *
+ (50 + std::min(grandparent_boundary_switched_num_ * 5,
+ size_t{40}))) {
+ return true;
+ }
+ }
+
+ // check ttl file boundaries if there's any
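+  // cur_files_to_cut_for_ttl_ is the index into files_to_cut_for_ttl_ of the
+  // file whose range contained the previous key (-1 if none), and
+  // next_files_to_cut_for_ttl_ is the next candidate index to examine.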
+ if (!files_to_cut_for_ttl_.empty()) {
+ if (cur_files_to_cut_for_ttl_ != -1) {
+ // Previous key is inside the range of a file
+ if (icmp->Compare(internal_key,
+ files_to_cut_for_ttl_[cur_files_to_cut_for_ttl_]
+ ->largest.Encode()) > 0) {
+ next_files_to_cut_for_ttl_ = cur_files_to_cut_for_ttl_ + 1;
+ cur_files_to_cut_for_ttl_ = -1;
+ return true;
+ }
+ } else {
+ // Look for the key position
+ while (next_files_to_cut_for_ttl_ <
+ static_cast<int>(files_to_cut_for_ttl_.size())) {
+ if (icmp->Compare(internal_key,
+ files_to_cut_for_ttl_[next_files_to_cut_for_ttl_]
+ ->smallest.Encode()) >= 0) {
+ if (icmp->Compare(internal_key,
+ files_to_cut_for_ttl_[next_files_to_cut_for_ttl_]
+ ->largest.Encode()) <= 0) {
+          // Within the current file
+ cur_files_to_cut_for_ttl_ = next_files_to_cut_for_ttl_;
+ return true;
+ }
+ // Beyond the current file
+ next_files_to_cut_for_ttl_++;
+ } else {
+ // Still fall into the gap
+ break;
+ }
+ }
+ }
+ }
+
+ return false;
+}
+
+Status CompactionOutputs::AddToOutput(
+ const CompactionIterator& c_iter,
+ const CompactionFileOpenFunc& open_file_func,
+ const CompactionFileCloseFunc& close_file_func) {
+ Status s;
+ const Slice& key = c_iter.key();
+
+ if (ShouldStopBefore(c_iter) && HasBuilder()) {
+ s = close_file_func(*this, c_iter.InputStatus(), key);
+ if (!s.ok()) {
+ return s;
+ }
+ // reset grandparent information
+ grandparent_boundary_switched_num_ = 0;
+ grandparent_overlapped_bytes_ =
+ GetCurrentKeyGrandparentOverlappedBytes(key);
+ }
+
+ // Open output file if necessary
+ if (!HasBuilder()) {
+ s = open_file_func(*this);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ assert(builder_ != nullptr);
+ const Slice& value = c_iter.value();
+ s = current_output().validator.Add(key, value);
+ if (!s.ok()) {
+ return s;
+ }
+ builder_->Add(key, value);
+
+ stats_.num_output_records++;
+ current_output_file_size_ = builder_->EstimatedFileSize();
+
+ if (blob_garbage_meter_) {
+ s = blob_garbage_meter_->ProcessOutFlow(key, value);
+ }
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ const ParsedInternalKey& ikey = c_iter.ikey();
+ s = current_output().meta.UpdateBoundaries(key, value, ikey.sequence,
+ ikey.type);
+
+ if (partitioner_) {
+ last_key_for_partitioner_.assign(c_iter.user_key().data_,
+ c_iter.user_key().size_);
+ }
+
+ return s;
+}
+
+Status CompactionOutputs::AddRangeDels(
+ const Slice* comp_start_user_key, const Slice* comp_end_user_key,
+ CompactionIterationStats& range_del_out_stats, bool bottommost_level,
+ const InternalKeyComparator& icmp, SequenceNumber earliest_snapshot,
+ const Slice& next_table_min_key, const std::string& full_history_ts_low) {
+ assert(HasRangeDel());
+ FileMetaData& meta = current_output().meta;
+ const Comparator* ucmp = icmp.user_comparator();
+
+ Slice lower_bound_guard, upper_bound_guard;
+ std::string smallest_user_key;
+ const Slice *lower_bound, *upper_bound;
+ bool lower_bound_from_sub_compact = false;
+
+ size_t output_size = outputs_.size();
+ if (output_size == 1) {
+ // For the first output table, include range tombstones before the min
+ // key but after the subcompaction boundary.
+ lower_bound = comp_start_user_key;
+ lower_bound_from_sub_compact = true;
+ } else if (meta.smallest.size() > 0) {
+ // For subsequent output tables, only include range tombstones from min
+ // key onwards since the previous file was extended to contain range
+ // tombstones falling before min key.
+ smallest_user_key = meta.smallest.user_key().ToString(false /*hex*/);
+ lower_bound_guard = Slice(smallest_user_key);
+ lower_bound = &lower_bound_guard;
+ } else {
+ lower_bound = nullptr;
+ }
+ if (!next_table_min_key.empty()) {
+    // This may be the last file in the subcompaction in some cases, so we
+    // need to compare the end key of the subcompaction with the next file's
+    // start key. When the end key is chosen by the subcompaction, we know
+    // that it must be the biggest key in the output file. Therefore, it is
+    // safe to use the smaller key as the upper bound of the output file, to
+    // ensure that there is no overlap between different output files.
+ upper_bound_guard = ExtractUserKey(next_table_min_key);
+ if (comp_end_user_key != nullptr &&
+ ucmp->CompareWithoutTimestamp(upper_bound_guard, *comp_end_user_key) >=
+ 0) {
+ upper_bound = comp_end_user_key;
+ } else {
+ upper_bound = &upper_bound_guard;
+ }
+ } else {
+ // This is the last file in the subcompaction, so extend until the
+ // subcompaction ends.
+ upper_bound = comp_end_user_key;
+ }
+ bool has_overlapping_endpoints;
+ if (upper_bound != nullptr && meta.largest.size() > 0) {
+ has_overlapping_endpoints = ucmp->CompareWithoutTimestamp(
+ meta.largest.user_key(), *upper_bound) == 0;
+ } else {
+ has_overlapping_endpoints = false;
+ }
+
+  // The end key of the subcompaction must be bigger than or equal to the
+  // upper bound. If the end of the subcompaction is null or the upper bound
+  // is null, it means that this file is the last file in the compaction, so
+  // there will be no overlap between this file and others.
+ assert(comp_end_user_key == nullptr || upper_bound == nullptr ||
+ ucmp->CompareWithoutTimestamp(*upper_bound, *comp_end_user_key) <= 0);
+ auto it = range_del_agg_->NewIterator(lower_bound, upper_bound,
+ has_overlapping_endpoints);
+ // Position the range tombstone output iterator. There may be tombstone
+ // fragments that are entirely out of range, so make sure that we do not
+ // include those.
+ if (lower_bound != nullptr) {
+ it->Seek(*lower_bound);
+ } else {
+ it->SeekToFirst();
+ }
+ for (; it->Valid(); it->Next()) {
+ auto tombstone = it->Tombstone();
+ if (upper_bound != nullptr) {
+ int cmp =
+ ucmp->CompareWithoutTimestamp(*upper_bound, tombstone.start_key_);
+ if ((has_overlapping_endpoints && cmp < 0) ||
+ (!has_overlapping_endpoints && cmp <= 0)) {
+ // Tombstones starting after upper_bound only need to be included in
+ // the next table. If the current SST ends before upper_bound, i.e.,
+ // `has_overlapping_endpoints == false`, we can also skip over range
+ // tombstones that start exactly at upper_bound. Such range
+ // tombstones will be included in the next file and are not relevant
+ // to the point keys or endpoints of the current file.
+ break;
+ }
+ }
+
+ const size_t ts_sz = ucmp->timestamp_size();
+ // Garbage collection for range tombstones.
+ // If user-defined timestamp is enabled, range tombstones are dropped if
+ // they are at bottommost_level, below full_history_ts_low and not visible
+ // in any snapshot. trim_ts_ is passed to the constructor for
+ // range_del_agg_, and range_del_agg_ internally drops tombstones above
+ // trim_ts_.
+ if (bottommost_level && tombstone.seq_ <= earliest_snapshot &&
+ (ts_sz == 0 ||
+ (!full_history_ts_low.empty() &&
+ ucmp->CompareTimestamp(tombstone.ts_, full_history_ts_low) < 0))) {
+ // TODO(andrewkr): tombstones that span multiple output files are
+ // counted for each compaction output file, so lots of double
+ // counting.
+ range_del_out_stats.num_range_del_drop_obsolete++;
+ range_del_out_stats.num_record_drop_obsolete++;
+ continue;
+ }
+
+ auto kv = tombstone.Serialize();
+ assert(lower_bound == nullptr ||
+ ucmp->CompareWithoutTimestamp(*lower_bound, kv.second) < 0);
+ // Range tombstone is not supported by output validator yet.
+ builder_->Add(kv.first.Encode(), kv.second);
+ InternalKey smallest_candidate = std::move(kv.first);
+ if (lower_bound != nullptr &&
+ ucmp->CompareWithoutTimestamp(smallest_candidate.user_key(),
+ *lower_bound) <= 0) {
+ // Pretend the smallest key has the same user key as lower_bound
+ // (the max key in the previous table or subcompaction) in order for
+ // files to appear key-space partitioned.
+ //
+ // When lower_bound is chosen by a subcompaction, we know that
+ // subcompactions over smaller keys cannot contain any keys at
+ // lower_bound. We also know that smaller subcompactions exist,
+      // because otherwise the subcompaction would be unbounded on the left.
+ // As a result, we know that no other files on the output level will
+ // contain actual keys at lower_bound (an output file may have a
+ // largest key of lower_bound@kMaxSequenceNumber, but this only
+ // indicates a large range tombstone was truncated). Therefore, it is
+ // safe to use the tombstone's sequence number, to ensure that keys at
+ // lower_bound at lower levels are covered by truncated tombstones.
+ //
+ // If lower_bound was chosen by the smallest data key in the file,
+ // choose lowest seqnum so this file's smallest internal key comes
+ // after the previous file's largest. The fake seqnum is OK because
+ // the read path's file-picking code only considers user key.
+ if (lower_bound_from_sub_compact) {
+ if (ts_sz) {
+ assert(tombstone.ts_.size() == ts_sz);
+ smallest_candidate = InternalKey(*lower_bound, tombstone.seq_,
+ kTypeRangeDeletion, tombstone.ts_);
+ } else {
+ smallest_candidate =
+ InternalKey(*lower_bound, tombstone.seq_, kTypeRangeDeletion);
+ }
+ } else {
+ smallest_candidate = InternalKey(*lower_bound, 0, kTypeRangeDeletion);
+ }
+ }
+ InternalKey largest_candidate = tombstone.SerializeEndKey();
+ if (upper_bound != nullptr &&
+ ucmp->CompareWithoutTimestamp(*upper_bound,
+ largest_candidate.user_key()) <= 0) {
+ // Pretend the largest key has the same user key as upper_bound (the
+ // min key in the following table or subcompaction) in order for files
+ // to appear key-space partitioned.
+ //
+ // Choose highest seqnum so this file's largest internal key comes
+ // before the next file's/subcompaction's smallest. The fake seqnum is
+ // OK because the read path's file-picking code only considers the
+ // user key portion.
+ //
+ // Note Seek() also creates InternalKey with (user_key,
+ // kMaxSequenceNumber), but with kTypeDeletion (0x7) instead of
+ // kTypeRangeDeletion (0xF), so the range tombstone comes before the
+ // Seek() key in InternalKey's ordering. So Seek() will look in the
+ // next file for the user key
+ if (ts_sz) {
+ static constexpr char kTsMax[] = "\xff\xff\xff\xff\xff\xff\xff\xff\xff";
+ if (ts_sz <= strlen(kTsMax)) {
+ largest_candidate =
+ InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion,
+ Slice(kTsMax, ts_sz));
+ } else {
+ largest_candidate =
+ InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion,
+ std::string(ts_sz, '\xff'));
+ }
+ } else {
+ largest_candidate =
+ InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion);
+ }
+ }
+#ifndef NDEBUG
+ SequenceNumber smallest_ikey_seqnum = kMaxSequenceNumber;
+ if (meta.smallest.size() > 0) {
+ smallest_ikey_seqnum = GetInternalKeySeqno(meta.smallest.Encode());
+ }
+#endif
+ meta.UpdateBoundariesForRange(smallest_candidate, largest_candidate,
+ tombstone.seq_, icmp);
+ // The smallest key in a file is used for range tombstone truncation, so
+ // it cannot have a seqnum of 0 (unless the smallest data key in a file
+ // has a seqnum of 0). Otherwise, the truncated tombstone may expose
+ // deleted keys at lower levels.
+ assert(smallest_ikey_seqnum == 0 ||
+ ExtractInternalKeyFooter(meta.smallest.Encode()) !=
+ PackSequenceAndType(0, kTypeRangeDeletion));
+ }
+ return Status::OK();
+}
+
+void CompactionOutputs::FillFilesToCutForTtl() {
+ if (compaction_->immutable_options()->compaction_style !=
+ kCompactionStyleLevel ||
+ compaction_->immutable_options()->compaction_pri !=
+ kMinOverlappingRatio ||
+ compaction_->mutable_cf_options()->ttl == 0 ||
+ compaction_->num_input_levels() < 2 || compaction_->bottommost_level()) {
+ return;
+ }
+
+  // We define a new file as one whose oldest ancestor time is younger than
+  // 1/4 of the TTL, and an old one as older than 1/2 of the TTL.
+ int64_t temp_current_time;
+ auto get_time_status =
+ compaction_->immutable_options()->clock->GetCurrentTime(
+ &temp_current_time);
+ if (!get_time_status.ok()) {
+ return;
+ }
+
+ auto current_time = static_cast<uint64_t>(temp_current_time);
+ if (current_time < compaction_->mutable_cf_options()->ttl) {
+ return;
+ }
+
+ uint64_t old_age_thres =
+ current_time - compaction_->mutable_cf_options()->ttl / 2;
+ const std::vector<FileMetaData*>& olevel =
+ *(compaction_->inputs(compaction_->num_input_levels() - 1));
+ for (FileMetaData* file : olevel) {
+ // Worth filtering out by start and end?
+ uint64_t oldest_ancester_time = file->TryGetOldestAncesterTime();
+    // We only pick old files that are not too small, to prevent a flood
+    // of small files.
+ if (oldest_ancester_time < old_age_thres &&
+ file->fd.GetFileSize() >
+ compaction_->mutable_cf_options()->target_file_size_base / 2) {
+ files_to_cut_for_ttl_.push_back(file);
+ }
+ }
+}
+
+CompactionOutputs::CompactionOutputs(const Compaction* compaction,
+ const bool is_penultimate_level)
+ : compaction_(compaction), is_penultimate_level_(is_penultimate_level) {
+ partitioner_ = compaction->output_level() == 0
+ ? nullptr
+ : compaction->CreateSstPartitioner();
+
+ if (compaction->output_level() != 0) {
+ FillFilesToCutForTtl();
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_outputs.h b/src/rocksdb/db/compaction/compaction_outputs.h
new file mode 100644
index 000000000..f40aa8215
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_outputs.h
@@ -0,0 +1,385 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "db/blob/blob_garbage_meter.h"
+#include "db/compaction/compaction.h"
+#include "db/compaction/compaction_iterator.h"
+#include "db/internal_stats.h"
+#include "db/output_validator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class CompactionOutputs;
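+// Callbacks supplied by the compaction job: the open function creates a new
+// compaction output file and the close function finishes and closes the
+// current one.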
+using CompactionFileOpenFunc = std::function<Status(CompactionOutputs&)>;
+using CompactionFileCloseFunc =
+ std::function<Status(CompactionOutputs&, const Status&, const Slice&)>;
+
+// Files produced by a subcompaction. Most of the functions are used by the
+// compaction_job open/close compaction file functions.
+class CompactionOutputs {
+ public:
+ // compaction output file
+ struct Output {
+ Output(FileMetaData&& _meta, const InternalKeyComparator& _icmp,
+ bool _enable_order_check, bool _enable_hash, bool _finished,
+ uint64_t precalculated_hash)
+ : meta(std::move(_meta)),
+ validator(_icmp, _enable_order_check, _enable_hash,
+ precalculated_hash),
+ finished(_finished) {}
+ FileMetaData meta;
+ OutputValidator validator;
+ bool finished;
+ std::shared_ptr<const TableProperties> table_properties;
+ };
+
+ CompactionOutputs() = delete;
+
+ explicit CompactionOutputs(const Compaction* compaction,
+ const bool is_penultimate_level);
+
+ // Add generated output to the list
+ void AddOutput(FileMetaData&& meta, const InternalKeyComparator& icmp,
+ bool enable_order_check, bool enable_hash,
+ bool finished = false, uint64_t precalculated_hash = 0) {
+ outputs_.emplace_back(std::move(meta), icmp, enable_order_check,
+ enable_hash, finished, precalculated_hash);
+ }
+
+ // Set new table builder for the current output
+ void NewBuilder(const TableBuilderOptions& tboptions);
+
+ // Assign a new WritableFileWriter to the current output
+ void AssignFileWriter(WritableFileWriter* writer) {
+ file_writer_.reset(writer);
+ }
+
+  // TODO: Remove it when remote compaction supports tiered compaction
+ void SetTotalBytes(uint64_t bytes) { stats_.bytes_written += bytes; }
+ void SetNumOutputRecords(uint64_t num) { stats_.num_output_records = num; }
+
+ // TODO: Move the BlobDB builder into CompactionOutputs
+ const std::vector<BlobFileAddition>& GetBlobFileAdditions() const {
+ if (is_penultimate_level_) {
+ assert(blob_file_additions_.empty());
+ }
+ return blob_file_additions_;
+ }
+
+ std::vector<BlobFileAddition>* GetBlobFileAdditionsPtr() {
+ assert(!is_penultimate_level_);
+ return &blob_file_additions_;
+ }
+
+ bool HasBlobFileAdditions() const { return !blob_file_additions_.empty(); }
+
+ BlobGarbageMeter* CreateBlobGarbageMeter() {
+ assert(!is_penultimate_level_);
+ blob_garbage_meter_ = std::make_unique<BlobGarbageMeter>();
+ return blob_garbage_meter_.get();
+ }
+
+ BlobGarbageMeter* GetBlobGarbageMeter() const {
+ if (is_penultimate_level_) {
+ // blobdb doesn't support per_key_placement yet
+ assert(blob_garbage_meter_ == nullptr);
+ return nullptr;
+ }
+ return blob_garbage_meter_.get();
+ }
+
+ void UpdateBlobStats() {
+ assert(!is_penultimate_level_);
+ stats_.num_output_files_blob = blob_file_additions_.size();
+ for (const auto& blob : blob_file_additions_) {
+ stats_.bytes_written_blob += blob.GetTotalBlobBytes();
+ }
+ }
+
+ // Finish the current output file
+  Status Finish(const Status& input_status,
+ const SeqnoToTimeMapping& seqno_time_mapping);
+
+ // Update output table properties from table builder
+ void UpdateTableProperties() {
+ current_output().table_properties =
+ std::make_shared<TableProperties>(GetTableProperties());
+ }
+
+  IOStatus WriterSyncClose(const Status& input_status, SystemClock* clock,
+ Statistics* statistics, bool use_fsync);
+
+ TableProperties GetTableProperties() {
+ return builder_->GetTableProperties();
+ }
+
+ Slice SmallestUserKey() const {
+ if (!outputs_.empty() && outputs_[0].finished) {
+ return outputs_[0].meta.smallest.user_key();
+ } else {
+ return Slice{nullptr, 0};
+ }
+ }
+
+ Slice LargestUserKey() const {
+ if (!outputs_.empty() && outputs_.back().finished) {
+ return outputs_.back().meta.largest.user_key();
+ } else {
+ return Slice{nullptr, 0};
+ }
+ }
+
+  // Remove the last output file if it is empty, since it doesn't need to be
+  // kept.
+ void RemoveLastEmptyOutput() {
+ if (!outputs_.empty() && !outputs_.back().meta.fd.file_size) {
+ // An error occurred, so ignore the last output.
+ outputs_.pop_back();
+ }
+ }
+
+  // Remove the last output, for example when the last output doesn't have
+  // data (no entries and no range-dels) but its file_size might not be 0, as
+  // it still has SST metadata.
+ void RemoveLastOutput() {
+ assert(!outputs_.empty());
+ outputs_.pop_back();
+ }
+
+ bool HasBuilder() const { return builder_ != nullptr; }
+
+ FileMetaData* GetMetaData() { return &current_output().meta; }
+
+ bool HasOutput() const { return !outputs_.empty(); }
+
+ uint64_t NumEntries() const { return builder_->NumEntries(); }
+
+ void ResetBuilder() {
+ builder_.reset();
+ current_output_file_size_ = 0;
+ }
+
+ // Add range-dels from the aggregator to the current output file
+ // @param comp_start_user_key and comp_end_user_key include timestamp if
+ // user-defined timestamp is enabled.
+ // @param full_history_ts_low used for range tombstone garbage collection.
+ Status AddRangeDels(const Slice* comp_start_user_key,
+ const Slice* comp_end_user_key,
+ CompactionIterationStats& range_del_out_stats,
+ bool bottommost_level, const InternalKeyComparator& icmp,
+ SequenceNumber earliest_snapshot,
+ const Slice& next_table_min_key,
+ const std::string& full_history_ts_low);
+
+  // Returns true if the outputs have range deletions; range deletions are
+  // also data.
+ bool HasRangeDel() const {
+ return range_del_agg_ && !range_del_agg_->IsEmpty();
+ }
+
+ private:
+ friend class SubcompactionState;
+
+ void FillFilesToCutForTtl();
+
+ void SetOutputSlitKey(const std::optional<Slice> start,
+ const std::optional<Slice> end) {
+ const InternalKeyComparator* icmp =
+ &compaction_->column_family_data()->internal_comparator();
+
+ const InternalKey* output_split_key = compaction_->GetOutputSplitKey();
+ // Invalid output_split_key indicates that we do not need to split
+ if (output_split_key != nullptr) {
+      // We may only split the output when the cursor is in the range.
+ if ((!end.has_value() ||
+ icmp->user_comparator()->Compare(
+ ExtractUserKey(output_split_key->Encode()), end.value()) < 0) &&
+ (!start.has_value() || icmp->user_comparator()->Compare(
+ ExtractUserKey(output_split_key->Encode()),
+ start.value()) > 0)) {
+ local_output_split_key_ = output_split_key;
+ }
+ }
+ }
+
+ // Returns true iff we should stop building the current output
+ // before processing the current key in compaction iterator.
+ bool ShouldStopBefore(const CompactionIterator& c_iter);
+
+ void Cleanup() {
+ if (builder_ != nullptr) {
+ // May happen if we get a shutdown call in the middle of compaction
+ builder_->Abandon();
+ builder_.reset();
+ }
+ }
+
+  // Update the tracked grandparent information, such as the grandparent
+  // index, whether the key is in the gap between 2 grandparent files, the
+  // accumulated grandparent file size, etc.
+  // It returns how many boundaries are crossed by including the current key.
+ size_t UpdateGrandparentBoundaryInfo(const Slice& internal_key);
+
+  // Helper function to get the overlapped grandparent file size; it's only
+  // used for calculating the first key's overlap.
+ uint64_t GetCurrentKeyGrandparentOverlappedBytes(
+ const Slice& internal_key) const;
+
+ // Add current key from compaction_iterator to the output file. If needed
+ // close and open new compaction output with the functions provided.
+ Status AddToOutput(const CompactionIterator& c_iter,
+ const CompactionFileOpenFunc& open_file_func,
+ const CompactionFileCloseFunc& close_file_func);
+
+ // Close the current output. `open_file_func` is needed for creating new file
+ // for range-dels only output file.
+ Status CloseOutput(const Status& curr_status,
+ const CompactionFileOpenFunc& open_file_func,
+ const CompactionFileCloseFunc& close_file_func) {
+ Status status = curr_status;
+ // handle subcompaction containing only range deletions
+ if (status.ok() && !HasBuilder() && !HasOutput() && HasRangeDel()) {
+ status = open_file_func(*this);
+ }
+ if (HasBuilder()) {
+ const Slice empty_key{};
+ Status s = close_file_func(*this, status, empty_key);
+ if (!s.ok() && status.ok()) {
+ status = s;
+ }
+ }
+
+ return status;
+ }
+
+  // This subcompaction's output could be empty if the compaction was aborted
+  // before this subcompaction had a chance to generate any output files. When
+  // subcompactions are executed sequentially this is more likely, especially
+  // for the later subcompactions. Once they are run in parallel, however, it
+  // should be much rarer.
+  // It's the caller's responsibility to make sure it's not empty.
+ Output& current_output() {
+ assert(!outputs_.empty());
+ return outputs_.back();
+ }
+
+  // Assign the range_del_agg to the target output level. There's only one
+  // range-del-aggregator per compaction outputs; for
+  // output_to_penultimate_level compaction it is only assigned to the
+  // penultimate level.
+ void AssignRangeDelAggregator(
+ std::unique_ptr<CompactionRangeDelAggregator>&& range_del_agg) {
+ assert(range_del_agg_ == nullptr);
+ range_del_agg_ = std::move(range_del_agg);
+ }
+
+ const Compaction* compaction_;
+
+ // current output builder and writer
+ std::unique_ptr<TableBuilder> builder_;
+ std::unique_ptr<WritableFileWriter> file_writer_;
+ uint64_t current_output_file_size_ = 0;
+
+ // all the compaction outputs so far
+ std::vector<Output> outputs_;
+
+ // BlobDB info
+ std::vector<BlobFileAddition> blob_file_additions_;
+ std::unique_ptr<BlobGarbageMeter> blob_garbage_meter_;
+
+ // Basic compaction output stats for this level's outputs
+ InternalStats::CompactionOutputsStats stats_;
+
+  // Indicates if this CompactionOutputs object is for the penultimate level;
+  // should always be false if the per_key_placement feature is not enabled.
+ const bool is_penultimate_level_;
+ std::unique_ptr<CompactionRangeDelAggregator> range_del_agg_ = nullptr;
+
+ // partitioner information
+ std::string last_key_for_partitioner_;
+ std::unique_ptr<SstPartitioner> partitioner_;
+
+  // A flag indicating whether this subcompaction has been split by the cursor
+ bool is_split_ = false;
+
+ // We also maintain the output split key for each subcompaction to avoid
+ // repetitive comparison in ShouldStopBefore()
+ const InternalKey* local_output_split_key_ = nullptr;
+
+  // Some identified files with an old oldest ancestor time; the range should
+  // be isolated out so that the output file(s) in that range can be merged
+  // down for TTL and the timestamps for the range cleared.
+ std::vector<FileMetaData*> files_to_cut_for_ttl_;
+ int cur_files_to_cut_for_ttl_ = -1;
+ int next_files_to_cut_for_ttl_ = 0;
+
+  // An index used to speed up ShouldStopBefore().
+ size_t grandparent_index_ = 0;
+
+  // Whether the current output key is in the gap between grandparent files,
+  // i.e.:
+  // key > grandparents[grandparent_index_ - 1].largest &&
+  // key < grandparents[grandparent_index_].smallest
+ bool being_grandparent_gap_ = true;
+
+ // The number of bytes overlapping between the current output and
+ // grandparent files used in ShouldStopBefore().
+ uint64_t grandparent_overlapped_bytes_ = 0;
+
+  // A flag indicating whether a key has been seen in ShouldStopBefore()
+ bool seen_key_ = false;
+
+  // For the current output file, how many grandparent file boundaries it has
+  // crossed; basically the number of overlapped files * 2.
+ size_t grandparent_boundary_switched_num_ = 0;
+};
+
+// helper struct to concatenate the last level and penultimate level outputs
+// which could be replaced by std::ranges::join_view() in c++20
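+// A minimal usage sketch (illustrative; the container names are made up):
+//   for (const auto& output :
+//        OutputIterator(last_level_outputs, penultimate_outputs)) {
+//     // visit each output of both levels exactly once
+//   }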
+struct OutputIterator {
+ public:
+ explicit OutputIterator(const std::vector<CompactionOutputs::Output>& a,
+ const std::vector<CompactionOutputs::Output>& b)
+ : a_(a), b_(b) {
+ within_a = !a_.empty();
+ idx_ = 0;
+ }
+
+ OutputIterator begin() { return *this; }
+
+ OutputIterator end() { return *this; }
+
+ size_t size() { return a_.size() + b_.size(); }
+
+ const CompactionOutputs::Output& operator*() const {
+ return within_a ? a_[idx_] : b_[idx_];
+ }
+
+ OutputIterator& operator++() {
+ idx_++;
+ if (within_a && idx_ >= a_.size()) {
+ within_a = false;
+ idx_ = 0;
+ }
+ assert(within_a || idx_ <= b_.size());
+ return *this;
+ }
+
+ bool operator!=(const OutputIterator& /*rhs*/) const {
+ return within_a || idx_ < b_.size();
+ }
+
+ private:
+ const std::vector<CompactionOutputs::Output>& a_;
+ const std::vector<CompactionOutputs::Output>& b_;
+ bool within_a;
+ size_t idx_;
+};
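+// Usage sketch (illustrative only; the names of the two output vectors below
+// are assumptions, not taken from real call sites):
+//   for (const auto& out :
+//        OutputIterator(penultimate_outputs, last_level_outputs)) {
+//     // process each Output in order, penultimate level first
+//   }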
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_picker.cc b/src/rocksdb/db/compaction/compaction_picker.cc
new file mode 100644
index 000000000..abdecca9f
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker.cc
@@ -0,0 +1,1234 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction_picker.h"
+
+#include <cinttypes>
+#include <limits>
+#include <queue>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/column_family.h"
+#include "file/filename.h"
+#include "logging/log_buffer.h"
+#include "logging/logging.h"
+#include "monitoring/statistics.h"
+#include "test_util/sync_point.h"
+#include "util/random.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool FindIntraL0Compaction(const std::vector<FileMetaData*>& level_files,
+ size_t min_files_to_compact,
+ uint64_t max_compact_bytes_per_del_file,
+ uint64_t max_compaction_bytes,
+ CompactionInputFiles* comp_inputs,
+ SequenceNumber earliest_mem_seqno) {
+  // Do not pick an ingested file when there is at least one unflushed memtable
+  // whose seqno range overlaps with the SST.
+ TEST_SYNC_POINT("FindIntraL0Compaction");
+ size_t start = 0;
+ for (; start < level_files.size(); start++) {
+ if (level_files[start]->being_compacted) {
+ return false;
+ }
+    // If there is no data in the memtable, the earliest sequence number would
+    // be the largest sequence number in the last memtable.
+    // Because all files are sorted in descending order by largest_seqno, we
+    // only need to check the first one.
+ if (level_files[start]->fd.largest_seqno <= earliest_mem_seqno) {
+ break;
+ }
+ }
+ if (start >= level_files.size()) {
+ return false;
+ }
+ size_t compact_bytes = static_cast<size_t>(level_files[start]->fd.file_size);
+ size_t compact_bytes_per_del_file = std::numeric_limits<size_t>::max();
+ // Compaction range will be [start, limit).
+ size_t limit;
+ // Pull in files until the amount of compaction work per deleted file begins
+ // increasing or maximum total compaction size is reached.
+ size_t new_compact_bytes_per_del_file = 0;
+ for (limit = start + 1; limit < level_files.size(); ++limit) {
+ compact_bytes += static_cast<size_t>(level_files[limit]->fd.file_size);
+ new_compact_bytes_per_del_file = compact_bytes / (limit - start);
+ if (level_files[limit]->being_compacted ||
+ new_compact_bytes_per_del_file > compact_bytes_per_del_file ||
+ compact_bytes > max_compaction_bytes) {
+ break;
+ }
+ compact_bytes_per_del_file = new_compact_bytes_per_del_file;
+ }
+
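+  // Accept the candidate only if it contains enough files and the average
+  // compaction work per deleted file stays below the configured maximum.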
+ if ((limit - start) >= min_files_to_compact &&
+ compact_bytes_per_del_file < max_compact_bytes_per_del_file) {
+ assert(comp_inputs != nullptr);
+ comp_inputs->level = 0;
+ for (size_t i = start; i < limit; ++i) {
+ comp_inputs->files.push_back(level_files[i]);
+ }
+ return true;
+ }
+ return false;
+}
+
+// Determine compression type, based on user options, level of the output
+// file and whether compression is disabled.
+// If enable_compression is false, then compression is always disabled no
+// matter what the values of the other two parameters are.
+// Otherwise, the compression type is determined based on options and level.
+CompressionType GetCompressionType(const VersionStorageInfo* vstorage,
+ const MutableCFOptions& mutable_cf_options,
+ int level, int base_level,
+ const bool enable_compression) {
+ if (!enable_compression) {
+ // disable compression
+ return kNoCompression;
+ }
+
+ // If bottommost_compression is set and we are compacting to the
+ // bottommost level then we should use it.
+ if (mutable_cf_options.bottommost_compression != kDisableCompressionOption &&
+ level >= (vstorage->num_non_empty_levels() - 1)) {
+ return mutable_cf_options.bottommost_compression;
+ }
+ // If the user has specified a different compression level for each level,
+ // then pick the compression for that level.
+ if (!mutable_cf_options.compression_per_level.empty()) {
+ assert(level == 0 || level >= base_level);
+ int idx = (level == 0) ? 0 : level - base_level + 1;
+
+ const int n =
+ static_cast<int>(mutable_cf_options.compression_per_level.size()) - 1;
+    // It is possible for level to be -1; in that case, we use level
+ // 0's compression. This occurs mostly in backwards compatibility
+ // situations when the builder doesn't know what level the file
+ // belongs to. Likewise, if level is beyond the end of the
+ // specified compression levels, use the last value.
+ return mutable_cf_options
+ .compression_per_level[std::max(0, std::min(idx, n))];
+ } else {
+ return mutable_cf_options.compression;
+ }
+}
+
+CompressionOptions GetCompressionOptions(const MutableCFOptions& cf_options,
+ const VersionStorageInfo* vstorage,
+ int level,
+ const bool enable_compression) {
+ if (!enable_compression) {
+ return cf_options.compression_opts;
+ }
+ // If bottommost_compression_opts is enabled and we are compacting to the
+ // bottommost level then we should use the specified compression options.
+ if (level >= (vstorage->num_non_empty_levels() - 1) &&
+ cf_options.bottommost_compression_opts.enabled) {
+ return cf_options.bottommost_compression_opts;
+ }
+ return cf_options.compression_opts;
+}
+
+CompactionPicker::CompactionPicker(const ImmutableOptions& ioptions,
+ const InternalKeyComparator* icmp)
+ : ioptions_(ioptions), icmp_(icmp) {}
+
+CompactionPicker::~CompactionPicker() {}
+
+// Delete this compaction from the list of running compactions.
+void CompactionPicker::ReleaseCompactionFiles(Compaction* c, Status status) {
+ UnregisterCompaction(c);
+ if (!status.ok()) {
+ c->ResetNextCompactionIndex();
+ }
+}
+
+void CompactionPicker::GetRange(const CompactionInputFiles& inputs,
+ InternalKey* smallest,
+ InternalKey* largest) const {
+ const int level = inputs.level;
+ assert(!inputs.empty());
+ smallest->Clear();
+ largest->Clear();
+
+ if (level == 0) {
+ for (size_t i = 0; i < inputs.size(); i++) {
+ FileMetaData* f = inputs[i];
+ if (i == 0) {
+ *smallest = f->smallest;
+ *largest = f->largest;
+ } else {
+ if (icmp_->Compare(f->smallest, *smallest) < 0) {
+ *smallest = f->smallest;
+ }
+ if (icmp_->Compare(f->largest, *largest) > 0) {
+ *largest = f->largest;
+ }
+ }
+ }
+ } else {
+ *smallest = inputs[0]->smallest;
+ *largest = inputs[inputs.size() - 1]->largest;
+ }
+}
+
+void CompactionPicker::GetRange(const CompactionInputFiles& inputs1,
+ const CompactionInputFiles& inputs2,
+ InternalKey* smallest,
+ InternalKey* largest) const {
+ assert(!inputs1.empty() || !inputs2.empty());
+ if (inputs1.empty()) {
+ GetRange(inputs2, smallest, largest);
+ } else if (inputs2.empty()) {
+ GetRange(inputs1, smallest, largest);
+ } else {
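+    // Take the union of the two ranges: the smaller of the smallest keys and
+    // the larger of the largest keys.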
+ InternalKey smallest1, smallest2, largest1, largest2;
+ GetRange(inputs1, &smallest1, &largest1);
+ GetRange(inputs2, &smallest2, &largest2);
+ *smallest =
+ icmp_->Compare(smallest1, smallest2) < 0 ? smallest1 : smallest2;
+ *largest = icmp_->Compare(largest1, largest2) < 0 ? largest2 : largest1;
+ }
+}
+
+void CompactionPicker::GetRange(const std::vector<CompactionInputFiles>& inputs,
+ InternalKey* smallest, InternalKey* largest,
+ int exclude_level) const {
+ InternalKey current_smallest;
+ InternalKey current_largest;
+ bool initialized = false;
+ for (const auto& in : inputs) {
+ if (in.empty() || in.level == exclude_level) {
+ continue;
+ }
+ GetRange(in, &current_smallest, &current_largest);
+ if (!initialized) {
+ *smallest = current_smallest;
+ *largest = current_largest;
+ initialized = true;
+ } else {
+ if (icmp_->Compare(current_smallest, *smallest) < 0) {
+ *smallest = current_smallest;
+ }
+ if (icmp_->Compare(current_largest, *largest) > 0) {
+ *largest = current_largest;
+ }
+ }
+ }
+ assert(initialized);
+}
+
+bool CompactionPicker::ExpandInputsToCleanCut(const std::string& /*cf_name*/,
+ VersionStorageInfo* vstorage,
+ CompactionInputFiles* inputs,
+ InternalKey** next_smallest) {
+  // An empty input set is not a valid compaction
+ assert(!inputs->empty());
+
+ const int level = inputs->level;
+ // GetOverlappingInputs will always do the right thing for level-0.
+ // So we don't need to do any expansion if level == 0.
+ if (level == 0) {
+ return true;
+ }
+
+ InternalKey smallest, largest;
+
+ // Keep expanding inputs until we are sure that there is a "clean cut"
+ // boundary between the files in input and the surrounding files.
+ // This will ensure that no parts of a key are lost during compaction.
+ int hint_index = -1;
+ size_t old_size;
+ do {
+ old_size = inputs->size();
+ GetRange(*inputs, &smallest, &largest);
+ inputs->clear();
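+    // Re-query the level for all files overlapping the current key range and
+    // repeat until the input set stops growing.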
+ vstorage->GetOverlappingInputs(level, &smallest, &largest, &inputs->files,
+ hint_index, &hint_index, true,
+ next_smallest);
+ } while (inputs->size() > old_size);
+
+  // We started off with non-empty inputs and the previous loop only grew
+  // them; thus, inputs should be non-empty here.
+ assert(!inputs->empty());
+
+ // If, after the expansion, there are files that are already under
+ // compaction, then we must drop/cancel this compaction.
+ if (AreFilesInCompaction(inputs->files)) {
+ return false;
+ }
+ return true;
+}
+
+bool CompactionPicker::RangeOverlapWithCompaction(
+ const Slice& smallest_user_key, const Slice& largest_user_key,
+ int level) const {
+ const Comparator* ucmp = icmp_->user_comparator();
+ for (Compaction* c : compactions_in_progress_) {
+ if (c->output_level() == level &&
+ ucmp->CompareWithoutTimestamp(smallest_user_key,
+ c->GetLargestUserKey()) <= 0 &&
+ ucmp->CompareWithoutTimestamp(largest_user_key,
+ c->GetSmallestUserKey()) >= 0) {
+ // Overlap
+ return true;
+ }
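+    // With per-key placement, the running compaction may also write to the
+    // penultimate level, so check that output range as well.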
+ if (c->SupportsPerKeyPlacement()) {
+ if (c->OverlapPenultimateLevelOutputRange(smallest_user_key,
+ largest_user_key)) {
+ return true;
+ }
+ }
+ }
+ // Did not overlap with any running compaction in level `level`
+ return false;
+}
+
+bool CompactionPicker::FilesRangeOverlapWithCompaction(
+ const std::vector<CompactionInputFiles>& inputs, int level,
+ int penultimate_level) const {
+ bool is_empty = true;
+ for (auto& in : inputs) {
+ if (!in.empty()) {
+ is_empty = false;
+ break;
+ }
+ }
+ if (is_empty) {
+ // No files in inputs
+ return false;
+ }
+
+ // TODO: Intra L0 compactions can have the ranges overlapped, but the input
+ // files cannot be overlapped in the order of L0 files.
+ InternalKey smallest, largest;
+ GetRange(inputs, &smallest, &largest, Compaction::kInvalidLevel);
+ if (penultimate_level != Compaction::kInvalidLevel) {
+ if (ioptions_.compaction_style == kCompactionStyleUniversal) {
+ if (RangeOverlapWithCompaction(smallest.user_key(), largest.user_key(),
+ penultimate_level)) {
+ return true;
+ }
+ } else {
+ InternalKey penultimate_smallest, penultimate_largest;
+ GetRange(inputs, &penultimate_smallest, &penultimate_largest, level);
+ if (RangeOverlapWithCompaction(penultimate_smallest.user_key(),
+ penultimate_largest.user_key(),
+ penultimate_level)) {
+ return true;
+ }
+ }
+ }
+
+ return RangeOverlapWithCompaction(smallest.user_key(), largest.user_key(),
+ level);
+}
+
+// Returns true if any one of specified files are being compacted
+bool CompactionPicker::AreFilesInCompaction(
+ const std::vector<FileMetaData*>& files) {
+ for (size_t i = 0; i < files.size(); i++) {
+ if (files[i]->being_compacted) {
+ return true;
+ }
+ }
+ return false;
+}
+
+Compaction* CompactionPicker::CompactFiles(
+ const CompactionOptions& compact_options,
+ const std::vector<CompactionInputFiles>& input_files, int output_level,
+ VersionStorageInfo* vstorage, const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options, uint32_t output_path_id) {
+#ifndef NDEBUG
+ assert(input_files.size());
+ // This compaction output should not overlap with a running compaction as
+ // `SanitizeCompactionInputFiles` should've checked earlier and db mutex
+ // shouldn't have been released since.
+ int start_level = Compaction::kInvalidLevel;
+ for (const auto& in : input_files) {
+ // input_files should already be sorted by level
+ if (!in.empty()) {
+ start_level = in.level;
+ break;
+ }
+ }
+ assert(output_level == 0 ||
+ !FilesRangeOverlapWithCompaction(
+ input_files, output_level,
+ Compaction::EvaluatePenultimateLevel(vstorage, ioptions_,
+ start_level, output_level)));
+#endif /* !NDEBUG */
+
+ CompressionType compression_type;
+ if (compact_options.compression == kDisableCompressionOption) {
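+    // The compression type was left as kDisableCompressionOption, so derive it
+    // from the column family options and the output level below.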
+ int base_level;
+ if (ioptions_.compaction_style == kCompactionStyleLevel) {
+ base_level = vstorage->base_level();
+ } else {
+ base_level = 1;
+ }
+ compression_type = GetCompressionType(vstorage, mutable_cf_options,
+ output_level, base_level);
+ } else {
+ // TODO(ajkr): `CompactionOptions` offers configurable `CompressionType`
+ // without configurable `CompressionOptions`, which is inconsistent.
+ compression_type = compact_options.compression;
+ }
+ auto c = new Compaction(
+ vstorage, ioptions_, mutable_cf_options, mutable_db_options, input_files,
+ output_level, compact_options.output_file_size_limit,
+ mutable_cf_options.max_compaction_bytes, output_path_id, compression_type,
+ GetCompressionOptions(mutable_cf_options, vstorage, output_level),
+ Temperature::kUnknown, compact_options.max_subcompactions,
+ /* grandparents */ {}, true);
+ RegisterCompaction(c);
+ return c;
+}
+
+Status CompactionPicker::GetCompactionInputsFromFileNumbers(
+ std::vector<CompactionInputFiles>* input_files,
+ std::unordered_set<uint64_t>* input_set, const VersionStorageInfo* vstorage,
+ const CompactionOptions& /*compact_options*/) const {
+ if (input_set->size() == 0U) {
+ return Status::InvalidArgument(
+ "Compaction must include at least one file.");
+ }
+ assert(input_files);
+
+ std::vector<CompactionInputFiles> matched_input_files;
+ matched_input_files.resize(vstorage->num_levels());
+ int first_non_empty_level = -1;
+ int last_non_empty_level = -1;
+ // TODO(yhchiang): use a lazy-initialized mapping from
+ // file_number to FileMetaData in Version.
+ for (int level = 0; level < vstorage->num_levels(); ++level) {
+ for (auto file : vstorage->LevelFiles(level)) {
+ auto iter = input_set->find(file->fd.GetNumber());
+ if (iter != input_set->end()) {
+ matched_input_files[level].files.push_back(file);
+ input_set->erase(iter);
+ last_non_empty_level = level;
+ if (first_non_empty_level == -1) {
+ first_non_empty_level = level;
+ }
+ }
+ }
+ }
+
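+  // Any file numbers remaining in input_set were not found in the current
+  // version and are reported as invalid arguments.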
+ if (!input_set->empty()) {
+ std::string message(
+ "Cannot find matched SST files for the following file numbers:");
+ for (auto fn : *input_set) {
+ message += " ";
+ message += std::to_string(fn);
+ }
+ return Status::InvalidArgument(message);
+ }
+
+ for (int level = first_non_empty_level; level <= last_non_empty_level;
+ ++level) {
+ matched_input_files[level].level = level;
+ input_files->emplace_back(std::move(matched_input_files[level]));
+ }
+
+ return Status::OK();
+}
+
+// Returns true if any one of the parent files are being compacted
+bool CompactionPicker::IsRangeInCompaction(VersionStorageInfo* vstorage,
+ const InternalKey* smallest,
+ const InternalKey* largest,
+ int level, int* level_index) {
+ std::vector<FileMetaData*> inputs;
+ assert(level < NumberLevels());
+
+ vstorage->GetOverlappingInputs(level, smallest, largest, &inputs,
+ level_index ? *level_index : 0, level_index);
+ return AreFilesInCompaction(inputs);
+}
+
+// Populates the set of inputs of all other levels that overlap with the
+// start level.
+// For now we assume all levels except the start level and the output level
+// are empty.
+// Will also attempt to expand the "start level" if that doesn't expand the
+// "output level" or cause the "start level" to include a file for compaction
+// that has an overlapping user-key with another file.
+// REQUIRES: input_level and output_level are different
+// REQUIRES: inputs->empty() == false
+// Returns false if files on parent level are currently in compaction, which
+// means that we can't compact them
+bool CompactionPicker::SetupOtherInputs(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* vstorage, CompactionInputFiles* inputs,
+ CompactionInputFiles* output_level_inputs, int* parent_index,
+ int base_index, bool only_expand_towards_right) {
+ assert(!inputs->empty());
+ assert(output_level_inputs->empty());
+ const int input_level = inputs->level;
+ const int output_level = output_level_inputs->level;
+ if (input_level == output_level) {
+ // no possibility of conflict
+ return true;
+ }
+
+ // For now, we only support merging two levels, start level and output level.
+ // We need to assert other levels are empty.
+ for (int l = input_level + 1; l < output_level; l++) {
+ assert(vstorage->NumLevelFiles(l) == 0);
+ }
+
+ InternalKey smallest, largest;
+
+ // Get the range one last time.
+ GetRange(*inputs, &smallest, &largest);
+
+  // Populate the set of next-level files (output_level_inputs) to
+  // include in the compaction
+ vstorage->GetOverlappingInputs(output_level, &smallest, &largest,
+ &output_level_inputs->files, *parent_index,
+ parent_index);
+ if (AreFilesInCompaction(output_level_inputs->files)) {
+ return false;
+ }
+ if (!output_level_inputs->empty()) {
+ if (!ExpandInputsToCleanCut(cf_name, vstorage, output_level_inputs)) {
+ return false;
+ }
+ }
+
+ // See if we can further grow the number of inputs in "level" without
+ // changing the number of "level+1" files we pick up. We also choose NOT
+ // to expand if this would cause "level" to include some entries for some
+ // user key, while excluding other entries for the same user key. This
+ // can happen when one user key spans multiple files.
+ if (!output_level_inputs->empty()) {
+ const uint64_t limit = mutable_cf_options.max_compaction_bytes;
+ const uint64_t output_level_inputs_size =
+ TotalFileSize(output_level_inputs->files);
+ const uint64_t inputs_size = TotalFileSize(inputs->files);
+ bool expand_inputs = false;
+
+ CompactionInputFiles expanded_inputs;
+ expanded_inputs.level = input_level;
+ // Get closed interval of output level
+ InternalKey all_start, all_limit;
+ GetRange(*inputs, *output_level_inputs, &all_start, &all_limit);
+ bool try_overlapping_inputs = true;
+ if (only_expand_towards_right) {
+ // Round-robin compaction only allows expansion towards the larger side.
+ vstorage->GetOverlappingInputs(input_level, &smallest, &all_limit,
+ &expanded_inputs.files, base_index,
+ nullptr);
+ } else {
+ vstorage->GetOverlappingInputs(input_level, &all_start, &all_limit,
+ &expanded_inputs.files, base_index,
+ nullptr);
+ }
+ uint64_t expanded_inputs_size = TotalFileSize(expanded_inputs.files);
+ if (!ExpandInputsToCleanCut(cf_name, vstorage, &expanded_inputs)) {
+ try_overlapping_inputs = false;
+ }
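+    // Adopt the expanded start-level inputs only if they add files, stay
+    // within the size limit, are not already being compacted, and do not pull
+    // in any additional output-level files.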
+ if (try_overlapping_inputs && expanded_inputs.size() > inputs->size() &&
+ (mutable_cf_options.ignore_max_compaction_bytes_for_input ||
+ output_level_inputs_size + expanded_inputs_size < limit) &&
+ !AreFilesInCompaction(expanded_inputs.files)) {
+ InternalKey new_start, new_limit;
+ GetRange(expanded_inputs, &new_start, &new_limit);
+ CompactionInputFiles expanded_output_level_inputs;
+ expanded_output_level_inputs.level = output_level;
+ vstorage->GetOverlappingInputs(output_level, &new_start, &new_limit,
+ &expanded_output_level_inputs.files,
+ *parent_index, parent_index);
+ assert(!expanded_output_level_inputs.empty());
+ if (!AreFilesInCompaction(expanded_output_level_inputs.files) &&
+ ExpandInputsToCleanCut(cf_name, vstorage,
+ &expanded_output_level_inputs) &&
+ expanded_output_level_inputs.size() == output_level_inputs->size()) {
+ expand_inputs = true;
+ }
+ }
+ if (!expand_inputs) {
+ vstorage->GetCleanInputsWithinInterval(input_level, &all_start,
+ &all_limit, &expanded_inputs.files,
+ base_index, nullptr);
+ expanded_inputs_size = TotalFileSize(expanded_inputs.files);
+ if (expanded_inputs.size() > inputs->size() &&
+ (mutable_cf_options.ignore_max_compaction_bytes_for_input ||
+ output_level_inputs_size + expanded_inputs_size < limit) &&
+ !AreFilesInCompaction(expanded_inputs.files)) {
+ expand_inputs = true;
+ }
+ }
+ if (expand_inputs) {
+ ROCKS_LOG_INFO(ioptions_.logger,
+ "[%s] Expanding@%d %" ROCKSDB_PRIszt "+%" ROCKSDB_PRIszt
+ "(%" PRIu64 "+%" PRIu64 " bytes) to %" ROCKSDB_PRIszt
+ "+%" ROCKSDB_PRIszt " (%" PRIu64 "+%" PRIu64 " bytes)\n",
+ cf_name.c_str(), input_level, inputs->size(),
+ output_level_inputs->size(), inputs_size,
+ output_level_inputs_size, expanded_inputs.size(),
+ output_level_inputs->size(), expanded_inputs_size,
+ output_level_inputs_size);
+ inputs->files = expanded_inputs.files;
+ }
+ } else {
+    // Likely to be a trivial move. Expand files if they are still trivial
+    // moves, but limit to mutable_cf_options.max_compaction_bytes or 8 files
+    // so that we don't create too much compaction pressure for the next level.
+ }
+ return true;
+}
+
+void CompactionPicker::GetGrandparents(
+ VersionStorageInfo* vstorage, const CompactionInputFiles& inputs,
+ const CompactionInputFiles& output_level_inputs,
+ std::vector<FileMetaData*>* grandparents) {
+ InternalKey start, limit;
+ GetRange(inputs, output_level_inputs, &start, &limit);
+ // Compute the set of grandparent files that overlap this compaction
+ // (parent == level+1; grandparent == level+2 or the first
+ // level after that has overlapping files)
+ for (int level = output_level_inputs.level + 1; level < NumberLevels();
+ level++) {
+ vstorage->GetOverlappingInputs(level, &start, &limit, grandparents);
+ if (!grandparents->empty()) {
+ break;
+ }
+ }
+}
+
+Compaction* CompactionPicker::CompactRange(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+ int input_level, int output_level,
+ const CompactRangeOptions& compact_range_options, const InternalKey* begin,
+ const InternalKey* end, InternalKey** compaction_end, bool* manual_conflict,
+ uint64_t max_file_num_to_ignore, const std::string& trim_ts) {
+ // CompactionPickerFIFO has its own implementation of compact range
+ assert(ioptions_.compaction_style != kCompactionStyleFIFO);
+
+ if (input_level == ColumnFamilyData::kCompactAllLevels) {
+ assert(ioptions_.compaction_style == kCompactionStyleUniversal);
+
+ // Universal compaction with more than one level always compacts all the
+ // files together to the last level.
+ assert(vstorage->num_levels() > 1);
+ // DBImpl::CompactRange() set output level to be the last level
+ if (ioptions_.allow_ingest_behind) {
+ assert(output_level == vstorage->num_levels() - 2);
+ } else {
+ assert(output_level == vstorage->num_levels() - 1);
+ }
+    // DBImpl::RunManualCompaction will use the full key range for universal
+    // compaction
+ assert(begin == nullptr);
+ assert(end == nullptr);
+ *compaction_end = nullptr;
+
+ int start_level = 0;
+ for (; start_level < vstorage->num_levels() &&
+ vstorage->NumLevelFiles(start_level) == 0;
+ start_level++) {
+ }
+ if (start_level == vstorage->num_levels()) {
+ return nullptr;
+ }
+
+ if ((start_level == 0) && (!level0_compactions_in_progress_.empty())) {
+ *manual_conflict = true;
+ // Only one level 0 compaction allowed
+ return nullptr;
+ }
+
+ std::vector<CompactionInputFiles> inputs(vstorage->num_levels() -
+ start_level);
+ for (int level = start_level; level < vstorage->num_levels(); level++) {
+ inputs[level - start_level].level = level;
+ auto& files = inputs[level - start_level].files;
+ for (FileMetaData* f : vstorage->LevelFiles(level)) {
+ files.push_back(f);
+ }
+ if (AreFilesInCompaction(files)) {
+ *manual_conflict = true;
+ return nullptr;
+ }
+ }
+
+    // Two non-exclusive manual compactions could run at the same time,
+    // producing overlapping outputs in the same level.
+ if (FilesRangeOverlapWithCompaction(
+ inputs, output_level,
+ Compaction::EvaluatePenultimateLevel(vstorage, ioptions_,
+ start_level, output_level))) {
+ // This compaction output could potentially conflict with the output
+ // of a currently running compaction, we cannot run it.
+ *manual_conflict = true;
+ return nullptr;
+ }
+
+ Compaction* c = new Compaction(
+ vstorage, ioptions_, mutable_cf_options, mutable_db_options,
+ std::move(inputs), output_level,
+ MaxFileSizeForLevel(mutable_cf_options, output_level,
+ ioptions_.compaction_style),
+ /* max_compaction_bytes */ LLONG_MAX,
+ compact_range_options.target_path_id,
+ GetCompressionType(vstorage, mutable_cf_options, output_level, 1),
+ GetCompressionOptions(mutable_cf_options, vstorage, output_level),
+ Temperature::kUnknown, compact_range_options.max_subcompactions,
+ /* grandparents */ {}, /* is manual */ true, trim_ts, /* score */ -1,
+ /* deletion_compaction */ false, /* l0_files_might_overlap */ true,
+ CompactionReason::kUnknown,
+ compact_range_options.blob_garbage_collection_policy,
+ compact_range_options.blob_garbage_collection_age_cutoff);
+
+ RegisterCompaction(c);
+ vstorage->ComputeCompactionScore(ioptions_, mutable_cf_options);
+ return c;
+ }
+
+ CompactionInputFiles inputs;
+ inputs.level = input_level;
+ bool covering_the_whole_range = true;
+
+ // All files are 'overlapping' in universal style compaction.
+ // We have to compact the entire range in one shot.
+ if (ioptions_.compaction_style == kCompactionStyleUniversal) {
+ begin = nullptr;
+ end = nullptr;
+ }
+
+ vstorage->GetOverlappingInputs(input_level, begin, end, &inputs.files);
+ if (inputs.empty()) {
+ return nullptr;
+ }
+
+ if ((input_level == 0) && (!level0_compactions_in_progress_.empty())) {
+ // Only one level 0 compaction allowed
+ TEST_SYNC_POINT("CompactionPicker::CompactRange:Conflict");
+ *manual_conflict = true;
+ return nullptr;
+ }
+
+ // Avoid compacting too much in one shot in case the range is large.
+ // But we cannot do this for level-0 since level-0 files can overlap
+ // and we must not pick one file and drop another older file if the
+ // two files overlap.
+ if (input_level > 0) {
+ const uint64_t limit = mutable_cf_options.max_compaction_bytes;
+ uint64_t input_level_total = 0;
+ int hint_index = -1;
+ InternalKey* smallest = nullptr;
+ InternalKey* largest = nullptr;
+ for (size_t i = 0; i + 1 < inputs.size(); ++i) {
+ if (!smallest) {
+ smallest = &inputs[i]->smallest;
+ }
+ largest = &inputs[i]->largest;
+
+ uint64_t input_file_size = inputs[i]->fd.GetFileSize();
+ uint64_t output_level_total = 0;
+ if (output_level < vstorage->num_non_empty_levels()) {
+ std::vector<FileMetaData*> files;
+ vstorage->GetOverlappingInputsRangeBinarySearch(
+ output_level, smallest, largest, &files, hint_index, &hint_index);
+ for (const auto& file : files) {
+ output_level_total += file->fd.GetFileSize();
+ }
+ }
+
+ input_level_total += input_file_size;
+
+ if (input_level_total + output_level_total >= limit) {
+ covering_the_whole_range = false;
+        // Still include the current file, so the compaction could be larger
+        // than max_compaction_bytes. This also makes sure the compaction can
+        // make progress even if `max_compaction_bytes` is small (e.g. smaller
+        // than an SST file).
+ inputs.files.resize(i + 1);
+ break;
+ }
+ }
+ }
+
+ assert(compact_range_options.target_path_id <
+ static_cast<uint32_t>(ioptions_.cf_paths.size()));
+
+  // For bottom-level compaction only, use max_file_num_to_ignore to filter out
+  // files that were created during the current compaction.
+ if (compact_range_options.bottommost_level_compaction ==
+ BottommostLevelCompaction::kForceOptimized &&
+ max_file_num_to_ignore != std::numeric_limits<uint64_t>::max()) {
+ assert(input_level == output_level);
+    // inputs_shrunk holds a contiguous subset of input files which were all
+    // created before the current manual compaction
+ std::vector<FileMetaData*> inputs_shrunk;
+ size_t skip_input_index = inputs.size();
+ for (size_t i = 0; i < inputs.size(); ++i) {
+ if (inputs[i]->fd.GetNumber() < max_file_num_to_ignore) {
+ inputs_shrunk.push_back(inputs[i]);
+ } else if (!inputs_shrunk.empty()) {
+ // inputs[i] was created during the current manual compaction and
+        // needs to be skipped
+ skip_input_index = i;
+ break;
+ }
+ }
+ if (inputs_shrunk.empty()) {
+ return nullptr;
+ }
+ if (inputs.size() != inputs_shrunk.size()) {
+ inputs.files.swap(inputs_shrunk);
+ }
+    // Set covering_the_whole_range to false if there is any file that needs to
+    // be compacted in the range of inputs[skip_input_index+1, inputs.size())
+ for (size_t i = skip_input_index + 1; i < inputs.size(); ++i) {
+ if (inputs[i]->fd.GetNumber() < max_file_num_to_ignore) {
+ covering_the_whole_range = false;
+ }
+ }
+ }
+
+ InternalKey key_storage;
+ InternalKey* next_smallest = &key_storage;
+ if (ExpandInputsToCleanCut(cf_name, vstorage, &inputs, &next_smallest) ==
+ false) {
+    // Manual compaction is now multi-threaded, so it can
+    // happen that ExpandInputsToCleanCut fails;
+    // we handle it higher up in RunManualCompaction.
+ *manual_conflict = true;
+ return nullptr;
+ }
+
+ if (covering_the_whole_range || !next_smallest) {
+ *compaction_end = nullptr;
+ } else {
+ **compaction_end = *next_smallest;
+ }
+
+ CompactionInputFiles output_level_inputs;
+ if (output_level == ColumnFamilyData::kCompactToBaseLevel) {
+ assert(input_level == 0);
+ output_level = vstorage->base_level();
+ assert(output_level > 0);
+ }
+ output_level_inputs.level = output_level;
+ if (input_level != output_level) {
+ int parent_index = -1;
+ if (!SetupOtherInputs(cf_name, mutable_cf_options, vstorage, &inputs,
+ &output_level_inputs, &parent_index, -1)) {
+      // Manual compaction is now multi-threaded, so it can
+      // happen that SetupOtherInputs fails;
+      // we handle it higher up in RunManualCompaction.
+ *manual_conflict = true;
+ return nullptr;
+ }
+ }
+
+ std::vector<CompactionInputFiles> compaction_inputs({inputs});
+ if (!output_level_inputs.empty()) {
+ compaction_inputs.push_back(output_level_inputs);
+ }
+ for (size_t i = 0; i < compaction_inputs.size(); i++) {
+ if (AreFilesInCompaction(compaction_inputs[i].files)) {
+ *manual_conflict = true;
+ return nullptr;
+ }
+ }
+
+  // Two non-exclusive manual compactions could run at the same time,
+  // producing overlapping outputs in the same level.
+ if (FilesRangeOverlapWithCompaction(
+ compaction_inputs, output_level,
+ Compaction::EvaluatePenultimateLevel(vstorage, ioptions_, input_level,
+ output_level))) {
+ // This compaction output could potentially conflict with the output
+ // of a currently running compaction, we cannot run it.
+ *manual_conflict = true;
+ return nullptr;
+ }
+
+ std::vector<FileMetaData*> grandparents;
+ GetGrandparents(vstorage, inputs, output_level_inputs, &grandparents);
+ Compaction* compaction = new Compaction(
+ vstorage, ioptions_, mutable_cf_options, mutable_db_options,
+ std::move(compaction_inputs), output_level,
+ MaxFileSizeForLevel(mutable_cf_options, output_level,
+ ioptions_.compaction_style, vstorage->base_level(),
+ ioptions_.level_compaction_dynamic_level_bytes),
+ mutable_cf_options.max_compaction_bytes,
+ compact_range_options.target_path_id,
+ GetCompressionType(vstorage, mutable_cf_options, output_level,
+ vstorage->base_level()),
+ GetCompressionOptions(mutable_cf_options, vstorage, output_level),
+ Temperature::kUnknown, compact_range_options.max_subcompactions,
+ std::move(grandparents), /* is manual */ true, trim_ts, /* score */ -1,
+ /* deletion_compaction */ false, /* l0_files_might_overlap */ true,
+ CompactionReason::kUnknown,
+ compact_range_options.blob_garbage_collection_policy,
+ compact_range_options.blob_garbage_collection_age_cutoff);
+
+ TEST_SYNC_POINT_CALLBACK("CompactionPicker::CompactRange:Return", compaction);
+ RegisterCompaction(compaction);
+
+  // Creating a compaction influences the compaction score because the score
+  // takes running compactions into account (by skipping files that are already
+  // being compacted). Since we just changed the compaction score, we
+  // recalculate it here.
+ vstorage->ComputeCompactionScore(ioptions_, mutable_cf_options);
+
+ return compaction;
+}
+
+#ifndef ROCKSDB_LITE
+namespace {
+// Test whether two files have overlapping key-ranges.
+bool HaveOverlappingKeyRanges(const Comparator* c, const SstFileMetaData& a,
+ const SstFileMetaData& b) {
+ if (c->CompareWithoutTimestamp(a.smallestkey, b.smallestkey) >= 0) {
+ if (c->CompareWithoutTimestamp(a.smallestkey, b.largestkey) <= 0) {
+ // b.smallestkey <= a.smallestkey <= b.largestkey
+ return true;
+ }
+ } else if (c->CompareWithoutTimestamp(a.largestkey, b.smallestkey) >= 0) {
+ // a.smallestkey < b.smallestkey <= a.largestkey
+ return true;
+ }
+ if (c->CompareWithoutTimestamp(a.largestkey, b.largestkey) <= 0) {
+ if (c->CompareWithoutTimestamp(a.largestkey, b.smallestkey) >= 0) {
+ // b.smallestkey <= a.largestkey <= b.largestkey
+ return true;
+ }
+ } else if (c->CompareWithoutTimestamp(a.smallestkey, b.largestkey) <= 0) {
+ // a.smallestkey <= b.largestkey < a.largestkey
+ return true;
+ }
+ return false;
+}
+} // namespace
+
+Status CompactionPicker::SanitizeCompactionInputFilesForAllLevels(
+ std::unordered_set<uint64_t>* input_files,
+ const ColumnFamilyMetaData& cf_meta, const int output_level) const {
+ auto& levels = cf_meta.levels;
+ auto comparator = icmp_->user_comparator();
+
+ // TODO(yhchiang): add is_adjustable to CompactionOptions
+
+ // the smallest and largest key of the current compaction input
+ std::string smallestkey;
+ std::string largestkey;
+ // a flag for initializing smallest and largest key
+ bool is_first = false;
+ const int kNotFound = -1;
+
+ // For each level, it does the following things:
+ // 1. Find the first and the last compaction input files
+ // in the current level.
+ // 2. Include all files between the first and the last
+ // compaction input files.
+ // 3. Update the compaction key-range.
+ // 4. For all remaining levels, include files that have
+ // overlapping key-range with the compaction key-range.
+ for (int l = 0; l <= output_level; ++l) {
+ auto& current_files = levels[l].files;
+ int first_included = static_cast<int>(current_files.size());
+ int last_included = kNotFound;
+
+ // identify the first and the last compaction input files
+ // in the current level.
+ for (size_t f = 0; f < current_files.size(); ++f) {
+ const uint64_t file_number = TableFileNameToNumber(current_files[f].name);
+ if (input_files->find(file_number) == input_files->end()) {
+ continue;
+ }
+ first_included = std::min(first_included, static_cast<int>(f));
+ last_included = std::max(last_included, static_cast<int>(f));
+ if (is_first == false) {
+ smallestkey = current_files[f].smallestkey;
+ largestkey = current_files[f].largestkey;
+ is_first = true;
+ }
+ }
+ if (last_included == kNotFound) {
+ continue;
+ }
+
+ if (l != 0) {
+ // expand the compaction input of the current level if it
+ // has overlapping key-range with other non-compaction input
+ // files in the same level.
+ while (first_included > 0) {
+ if (comparator->CompareWithoutTimestamp(
+ current_files[first_included - 1].largestkey,
+ current_files[first_included].smallestkey) < 0) {
+ break;
+ }
+ first_included--;
+ }
+
+ while (last_included < static_cast<int>(current_files.size()) - 1) {
+ if (comparator->CompareWithoutTimestamp(
+ current_files[last_included + 1].smallestkey,
+ current_files[last_included].largestkey) > 0) {
+ break;
+ }
+ last_included++;
+ }
+ } else if (output_level > 0) {
+ last_included = static_cast<int>(current_files.size() - 1);
+ }
+
+ // include all files between the first and the last compaction input files.
+ for (int f = first_included; f <= last_included; ++f) {
+ if (current_files[f].being_compacted) {
+ return Status::Aborted("Necessary compaction input file " +
+ current_files[f].name +
+ " is currently being compacted.");
+ }
+ input_files->insert(TableFileNameToNumber(current_files[f].name));
+ }
+
+ // update smallest and largest key
+ if (l == 0) {
+ for (int f = first_included; f <= last_included; ++f) {
+ if (comparator->CompareWithoutTimestamp(
+ smallestkey, current_files[f].smallestkey) > 0) {
+ smallestkey = current_files[f].smallestkey;
+ }
+ if (comparator->CompareWithoutTimestamp(
+ largestkey, current_files[f].largestkey) < 0) {
+ largestkey = current_files[f].largestkey;
+ }
+ }
+ } else {
+ if (comparator->CompareWithoutTimestamp(
+ smallestkey, current_files[first_included].smallestkey) > 0) {
+ smallestkey = current_files[first_included].smallestkey;
+ }
+ if (comparator->CompareWithoutTimestamp(
+ largestkey, current_files[last_included].largestkey) < 0) {
+ largestkey = current_files[last_included].largestkey;
+ }
+ }
+
+ SstFileMetaData aggregated_file_meta;
+ aggregated_file_meta.smallestkey = smallestkey;
+ aggregated_file_meta.largestkey = largestkey;
+
+ // For all lower levels, include all overlapping files.
+    // We need to add overlapping files from the current level too, because
+    // even if there are no input_files in level l, we would still need to add
+    // files which overlap with the range containing the input_files in levels
+    // 0 to l. Level 0 doesn't need to be handled this way because its files
+    // are sorted by time and not by key.
+ for (int m = std::max(l, 1); m <= output_level; ++m) {
+ for (auto& next_lv_file : levels[m].files) {
+ if (HaveOverlappingKeyRanges(comparator, aggregated_file_meta,
+ next_lv_file)) {
+ if (next_lv_file.being_compacted) {
+          return Status::Aborted(
+              "File " + next_lv_file.name +
+              " that has an overlapping key range with one of the compaction "
+              "input files is currently being compacted.");
+ }
+ input_files->insert(TableFileNameToNumber(next_lv_file.name));
+ }
+ }
+ }
+ }
+ if (RangeOverlapWithCompaction(smallestkey, largestkey, output_level)) {
+ return Status::Aborted(
+ "A running compaction is writing to the same output level in an "
+ "overlapping key range");
+ }
+ return Status::OK();
+}
+
+Status CompactionPicker::SanitizeCompactionInputFiles(
+ std::unordered_set<uint64_t>* input_files,
+ const ColumnFamilyMetaData& cf_meta, const int output_level) const {
+ assert(static_cast<int>(cf_meta.levels.size()) - 1 ==
+ cf_meta.levels[cf_meta.levels.size() - 1].level);
+ if (output_level >= static_cast<int>(cf_meta.levels.size())) {
+    return Status::InvalidArgument(
+        "Output level for column family " + cf_meta.name +
+        " must be between [0, " +
+        std::to_string(cf_meta.levels[cf_meta.levels.size() - 1].level) + "].");
+ }
+
+ if (output_level > MaxOutputLevel()) {
+    return Status::InvalidArgument(
+        "Exceeds the maximum output level defined by "
+        "the current compaction algorithm --- " +
+        std::to_string(MaxOutputLevel()));
+ }
+
+ if (output_level < 0) {
+ return Status::InvalidArgument("Output level cannot be negative.");
+ }
+
+ if (input_files->size() == 0) {
+ return Status::InvalidArgument(
+ "A compaction must contain at least one file.");
+ }
+
+ Status s = SanitizeCompactionInputFilesForAllLevels(input_files, cf_meta,
+ output_level);
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ // for all input files, check whether the file number matches
+ // any currently-existing files.
+ for (auto file_num : *input_files) {
+ bool found = false;
+ int input_file_level = -1;
+ for (const auto& level_meta : cf_meta.levels) {
+ for (const auto& file_meta : level_meta.files) {
+ if (file_num == TableFileNameToNumber(file_meta.name)) {
+ if (file_meta.being_compacted) {
+ return Status::Aborted("Specified compaction input file " +
+ MakeTableFileName("", file_num) +
+ " is already being compacted.");
+ }
+ found = true;
+ input_file_level = level_meta.level;
+ break;
+ }
+ }
+ if (found) {
+ break;
+ }
+ }
+ if (!found) {
+ return Status::InvalidArgument(
+ "Specified compaction input file " + MakeTableFileName("", file_num) +
+ " does not exist in column family " + cf_meta.name + ".");
+ }
+ if (input_file_level > output_level) {
+      return Status::InvalidArgument(
+          "Cannot compact a file to a higher level, input file: " +
+          MakeTableFileName("", file_num) + " level " +
+          std::to_string(input_file_level) + " > output level " +
+          std::to_string(output_level));
+ }
+ }
+
+ return Status::OK();
+}
+#endif // !ROCKSDB_LITE
+
+void CompactionPicker::RegisterCompaction(Compaction* c) {
+ if (c == nullptr) {
+ return;
+ }
+ assert(ioptions_.compaction_style != kCompactionStyleLevel ||
+ c->output_level() == 0 ||
+ !FilesRangeOverlapWithCompaction(*c->inputs(), c->output_level(),
+ c->GetPenultimateLevel()));
+ if (c->start_level() == 0 ||
+ ioptions_.compaction_style == kCompactionStyleUniversal) {
+ level0_compactions_in_progress_.insert(c);
+ }
+ compactions_in_progress_.insert(c);
+ TEST_SYNC_POINT_CALLBACK("CompactionPicker::RegisterCompaction:Registered",
+ c);
+}
+
+void CompactionPicker::UnregisterCompaction(Compaction* c) {
+ if (c == nullptr) {
+ return;
+ }
+ if (c->start_level() == 0 ||
+ ioptions_.compaction_style == kCompactionStyleUniversal) {
+ level0_compactions_in_progress_.erase(c);
+ }
+ compactions_in_progress_.erase(c);
+}
+
+void CompactionPicker::PickFilesMarkedForCompaction(
+ const std::string& cf_name, VersionStorageInfo* vstorage, int* start_level,
+ int* output_level, CompactionInputFiles* start_level_inputs) {
+ if (vstorage->FilesMarkedForCompaction().empty()) {
+ return;
+ }
+
+ auto continuation = [&, cf_name](std::pair<int, FileMetaData*> level_file) {
+ // If it's being compacted it has nothing to do here.
+ // If this assert() fails that means that some function marked some
+ // files as being_compacted, but didn't call ComputeCompactionScore()
+ assert(!level_file.second->being_compacted);
+ *start_level = level_file.first;
+ *output_level =
+ (*start_level == 0) ? vstorage->base_level() : *start_level + 1;
+
+ if (*start_level == 0 && !level0_compactions_in_progress()->empty()) {
+ return false;
+ }
+
+ start_level_inputs->files = {level_file.second};
+ start_level_inputs->level = *start_level;
+ return ExpandInputsToCleanCut(cf_name, vstorage, start_level_inputs);
+ };
+
+ // take a chance on a random file first
+ Random64 rnd(/* seed */ reinterpret_cast<uint64_t>(vstorage));
+ size_t random_file_index = static_cast<size_t>(rnd.Uniform(
+ static_cast<uint64_t>(vstorage->FilesMarkedForCompaction().size())));
+ TEST_SYNC_POINT_CALLBACK("CompactionPicker::PickFilesMarkedForCompaction",
+ &random_file_index);
+
+ if (continuation(vstorage->FilesMarkedForCompaction()[random_file_index])) {
+ // found the compaction!
+ return;
+ }
+
+ for (auto& level_file : vstorage->FilesMarkedForCompaction()) {
+ if (continuation(level_file)) {
+ // found the compaction!
+ return;
+ }
+ }
+ start_level_inputs->files.clear();
+}
+
+bool CompactionPicker::GetOverlappingL0Files(
+ VersionStorageInfo* vstorage, CompactionInputFiles* start_level_inputs,
+ int output_level, int* parent_index) {
+  // Two level-0 compactions won't run at the same time, so we don't need to
+  // worry about files on level 0 being compacted.
+ assert(level0_compactions_in_progress()->empty());
+ InternalKey smallest, largest;
+ GetRange(*start_level_inputs, &smallest, &largest);
+ // Note that the next call will discard the file we placed in
+ // c->inputs_[0] earlier and replace it with an overlapping set
+ // which will include the picked file.
+ start_level_inputs->files.clear();
+ vstorage->GetOverlappingInputs(0, &smallest, &largest,
+ &(start_level_inputs->files));
+
+ // If we include more L0 files in the same compaction run it can
+ // cause the 'smallest' and 'largest' key to get extended to a
+ // larger range. So, re-invoke GetRange to get the new key range
+ GetRange(*start_level_inputs, &smallest, &largest);
+ if (IsRangeInCompaction(vstorage, &smallest, &largest, output_level,
+ parent_index)) {
+ return false;
+ }
+ assert(!start_level_inputs->files.empty());
+
+ return true;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_picker.h b/src/rocksdb/db/compaction/compaction_picker.h
new file mode 100644
index 000000000..7739dd96b
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker.h
@@ -0,0 +1,323 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <memory>
+#include <set>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "db/compaction/compaction.h"
+#include "db/version_set.h"
+#include "options/cf_options.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// This file contains the abstract class CompactionPicker, its two
+// subclasses LevelCompactionPicker and NullCompactionPicker, as
+// well as some helper functions used by them.
+
+class LogBuffer;
+class Compaction;
+class VersionStorageInfo;
+struct CompactionInputFiles;
+
+// An abstract class to pick compactions from an existing LSM-tree.
+//
+// Each compaction style inherits the class and implements the
+// interface to form automatic compactions. If NeedsCompaction() is true,
+// then call PickCompaction() to find what files need to be compacted
+// and where to put the output files.
+//
+// Non-virtual functions CompactRange() and CompactFiles() are used to
+// pick files to compact based on users' DB::CompactRange() and
+// DB::CompactFiles() requests, respectively. There is little
+// compaction style specific logic for them.
+class CompactionPicker {
+ public:
+ CompactionPicker(const ImmutableOptions& ioptions,
+ const InternalKeyComparator* icmp);
+ virtual ~CompactionPicker();
+
+ // Pick level and inputs for a new compaction.
+ // Returns nullptr if there is no compaction to be done.
+ // Otherwise returns a pointer to a heap-allocated object that
+ // describes the compaction. Caller should delete the result.
+ virtual Compaction* PickCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+ LogBuffer* log_buffer,
+ SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) = 0;
+
+ // Return a compaction object for compacting the range [begin,end] in
+ // the specified level. Returns nullptr if there is nothing in that
+ // level that overlaps the specified range. Caller should delete
+ // the result.
+ //
+ // The returned Compaction might not include the whole requested range.
+ // In that case, compaction_end will be set to the next key that needs
+ // compacting. In case the compaction will compact the whole range,
+ // compaction_end will be set to nullptr.
+ // Client is responsible for compaction_end storage -- when called,
+ // *compaction_end should point to valid InternalKey!
+ virtual Compaction* CompactRange(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+ int input_level, int output_level,
+ const CompactRangeOptions& compact_range_options,
+ const InternalKey* begin, const InternalKey* end,
+ InternalKey** compaction_end, bool* manual_conflict,
+ uint64_t max_file_num_to_ignore, const std::string& trim_ts);
+
+ // The maximum allowed output level. Default value is NumberLevels() - 1.
+ virtual int MaxOutputLevel() const { return NumberLevels() - 1; }
+
+ virtual bool NeedsCompaction(const VersionStorageInfo* vstorage) const = 0;
+
+// Sanitize the input set of compaction input files.
+// When the input parameters do not describe a valid compaction, the
+// function will try to fix the input_files by adding necessary
+// files. If it's not possible to convert an invalid input_files
+// into a valid one by adding more files, the function will return a
+// non-ok status with a specific reason.
+#ifndef ROCKSDB_LITE
+ Status SanitizeCompactionInputFiles(std::unordered_set<uint64_t>* input_files,
+ const ColumnFamilyMetaData& cf_meta,
+ const int output_level) const;
+#endif // ROCKSDB_LITE
+
+ // Free up the files that participated in a compaction
+ //
+ // Requirement: DB mutex held
+ void ReleaseCompactionFiles(Compaction* c, Status status);
+
+ // Returns true if any one of the specified files are being compacted
+ bool AreFilesInCompaction(const std::vector<FileMetaData*>& files);
+
+ // Takes a list of CompactionInputFiles and returns a (manual) Compaction
+ // object.
+ //
+ // Caller must provide a set of input files that has been passed through
+ // `SanitizeCompactionInputFiles` earlier. The lock should not be released
+ // between that call and this one.
+ Compaction* CompactFiles(const CompactionOptions& compact_options,
+ const std::vector<CompactionInputFiles>& input_files,
+ int output_level, VersionStorageInfo* vstorage,
+ const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options,
+ uint32_t output_path_id);
+
+ // Converts a set of compaction input file numbers into
+ // a list of CompactionInputFiles.
+ Status GetCompactionInputsFromFileNumbers(
+ std::vector<CompactionInputFiles>* input_files,
+ std::unordered_set<uint64_t>* input_set,
+ const VersionStorageInfo* vstorage,
+ const CompactionOptions& compact_options) const;
+
+ // Is there currently a compaction involving level 0 taking place
+ bool IsLevel0CompactionInProgress() const {
+ return !level0_compactions_in_progress_.empty();
+ }
+
+ // Return true if the passed key range overlap with a compaction output
+ // that is currently running.
+ bool RangeOverlapWithCompaction(const Slice& smallest_user_key,
+ const Slice& largest_user_key,
+ int level) const;
+
+ // Stores the minimal range that covers all entries in inputs in
+ // *smallest, *largest.
+ // REQUIRES: inputs is not empty
+ void GetRange(const CompactionInputFiles& inputs, InternalKey* smallest,
+ InternalKey* largest) const;
+
+ // Stores the minimal range that covers all entries in inputs1 and inputs2
+ // in *smallest, *largest.
+  // REQUIRES: at least one of inputs1 and inputs2 is not empty
+ void GetRange(const CompactionInputFiles& inputs1,
+ const CompactionInputFiles& inputs2, InternalKey* smallest,
+ InternalKey* largest) const;
+
+ // Stores the minimal range that covers all entries in inputs
+ // in *smallest, *largest.
+  // REQUIRES: inputs is not empty (at least one entry has one file)
+ void GetRange(const std::vector<CompactionInputFiles>& inputs,
+ InternalKey* smallest, InternalKey* largest,
+ int exclude_level) const;
+
+ int NumberLevels() const { return ioptions_.num_levels; }
+
+ // Add more files to the inputs on "level" to make sure that
+ // no newer version of a key is compacted to "level+1" while leaving an older
+  // version in "level". Otherwise, any Get() will search "level" first,
+ // and will likely return an old/stale value for the key, since it always
+ // searches in increasing order of level to find the value. This could
+ // also scramble the order of merge operands. This function should be
+ // called any time a new Compaction is created, and its inputs_[0] are
+ // populated.
+ //
+ // Will return false if it is impossible to apply this compaction.
+ bool ExpandInputsToCleanCut(const std::string& cf_name,
+ VersionStorageInfo* vstorage,
+ CompactionInputFiles* inputs,
+ InternalKey** next_smallest = nullptr);
+
+ // Returns true if any one of the parent files are being compacted
+ bool IsRangeInCompaction(VersionStorageInfo* vstorage,
+ const InternalKey* smallest,
+ const InternalKey* largest, int level, int* index);
+
+ // Returns true if the key range that `inputs` files cover overlap with the
+ // key range of a currently running compaction.
+ bool FilesRangeOverlapWithCompaction(
+ const std::vector<CompactionInputFiles>& inputs, int level,
+ int penultimate_level) const;
+
+ bool SetupOtherInputs(const std::string& cf_name,
+ const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* vstorage,
+ CompactionInputFiles* inputs,
+ CompactionInputFiles* output_level_inputs,
+ int* parent_index, int base_index,
+ bool only_expand_towards_right = false);
+
+ void GetGrandparents(VersionStorageInfo* vstorage,
+ const CompactionInputFiles& inputs,
+ const CompactionInputFiles& output_level_inputs,
+ std::vector<FileMetaData*>* grandparents);
+
+ void PickFilesMarkedForCompaction(const std::string& cf_name,
+ VersionStorageInfo* vstorage,
+ int* start_level, int* output_level,
+ CompactionInputFiles* start_level_inputs);
+
+ bool GetOverlappingL0Files(VersionStorageInfo* vstorage,
+ CompactionInputFiles* start_level_inputs,
+ int output_level, int* parent_index);
+
+ // Register this compaction in the set of running compactions
+ void RegisterCompaction(Compaction* c);
+
+ // Remove this compaction from the set of running compactions
+ void UnregisterCompaction(Compaction* c);
+
+ std::set<Compaction*>* level0_compactions_in_progress() {
+ return &level0_compactions_in_progress_;
+ }
+ std::unordered_set<Compaction*>* compactions_in_progress() {
+ return &compactions_in_progress_;
+ }
+
+ const InternalKeyComparator* icmp() const { return icmp_; }
+
+ protected:
+ const ImmutableOptions& ioptions_;
+
+// A helper function to SanitizeCompactionInputFiles() that
+// sanitizes "input_files" by adding necessary files.
+#ifndef ROCKSDB_LITE
+ virtual Status SanitizeCompactionInputFilesForAllLevels(
+ std::unordered_set<uint64_t>* input_files,
+ const ColumnFamilyMetaData& cf_meta, const int output_level) const;
+#endif // ROCKSDB_LITE
+
+ // Keeps track of all compactions that are running on Level0.
+ // Protected by DB mutex
+ std::set<Compaction*> level0_compactions_in_progress_;
+
+ // Keeps track of all compactions that are running.
+ // Protected by DB mutex
+ std::unordered_set<Compaction*> compactions_in_progress_;
+
+ const InternalKeyComparator* const icmp_;
+};
+
+#ifndef ROCKSDB_LITE
+// A dummy compaction picker that never triggers any automatic
+// compaction.
+class NullCompactionPicker : public CompactionPicker {
+ public:
+ NullCompactionPicker(const ImmutableOptions& ioptions,
+ const InternalKeyComparator* icmp)
+ : CompactionPicker(ioptions, icmp) {}
+ virtual ~NullCompactionPicker() {}
+
+ // Always return "nullptr"
+ Compaction* PickCompaction(
+ const std::string& /*cf_name*/,
+ const MutableCFOptions& /*mutable_cf_options*/,
+ const MutableDBOptions& /*mutable_db_options*/,
+ VersionStorageInfo* /*vstorage*/, LogBuffer* /* log_buffer */,
+ SequenceNumber /* earliest_memtable_seqno */) override {
+ return nullptr;
+ }
+
+ // Always return "nullptr"
+ Compaction* CompactRange(const std::string& /*cf_name*/,
+ const MutableCFOptions& /*mutable_cf_options*/,
+ const MutableDBOptions& /*mutable_db_options*/,
+ VersionStorageInfo* /*vstorage*/,
+ int /*input_level*/, int /*output_level*/,
+ const CompactRangeOptions& /*compact_range_options*/,
+ const InternalKey* /*begin*/,
+ const InternalKey* /*end*/,
+ InternalKey** /*compaction_end*/,
+ bool* /*manual_conflict*/,
+ uint64_t /*max_file_num_to_ignore*/,
+ const std::string& /*trim_ts*/) override {
+ return nullptr;
+ }
+
+ // Always returns false.
+ virtual bool NeedsCompaction(
+ const VersionStorageInfo* /*vstorage*/) const override {
+ return false;
+ }
+};
+#endif // !ROCKSDB_LITE
+
+// Attempts to find an intra L0 compaction conforming to the given parameters.
+//
+// @param level_files Metadata for L0 files.
+// @param min_files_to_compact Minimum number of files required to
+// do the compaction.
+// @param max_compact_bytes_per_del_file Maximum average size in bytes per
+// file that is going to get deleted by
+// the compaction.
+// @param max_compaction_bytes Maximum total size in bytes (in terms
+// of compensated file size) for files
+// to be compacted.
+// @param [out] comp_inputs If a compaction was found, will be
+// initialized with corresponding input
+// files. Cannot be nullptr.
+//
+// @return true iff compaction was found.
+bool FindIntraL0Compaction(
+ const std::vector<FileMetaData*>& level_files, size_t min_files_to_compact,
+ uint64_t max_compact_bytes_per_del_file, uint64_t max_compaction_bytes,
+ CompactionInputFiles* comp_inputs,
+ SequenceNumber earliest_mem_seqno = kMaxSequenceNumber);
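+// Minimal usage sketch (illustrative only; the parameter values and the
+// `vstorage` variable below are assumptions, not real call sites):
+//   CompactionInputFiles inputs;
+//   if (FindIntraL0Compaction(vstorage->LevelFiles(0),
+//                             /*min_files_to_compact=*/4,
+//                             /*max_compact_bytes_per_del_file=*/64u << 20,
+//                             /*max_compaction_bytes=*/1u << 30, &inputs)) {
+//     // inputs.files now holds a prefix of the L0 file list that is eligible
+//     // for an intra-L0 compaction.
+//   }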
+
+CompressionType GetCompressionType(const VersionStorageInfo* vstorage,
+ const MutableCFOptions& mutable_cf_options,
+ int level, int base_level,
+ const bool enable_compression = true);
+
+CompressionOptions GetCompressionOptions(
+ const MutableCFOptions& mutable_cf_options,
+ const VersionStorageInfo* vstorage, int level,
+ const bool enable_compression = true);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_picker_fifo.cc b/src/rocksdb/db/compaction/compaction_picker_fifo.cc
new file mode 100644
index 000000000..1f875e3e1
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_fifo.cc
@@ -0,0 +1,433 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction_picker_fifo.h"
+#ifndef ROCKSDB_LITE
+
+#include <cinttypes>
+#include <string>
+#include <vector>
+
+#include "db/column_family.h"
+#include "logging/log_buffer.h"
+#include "logging/logging.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+uint64_t GetTotalFilesSize(const std::vector<FileMetaData*>& files) {
+ uint64_t total_size = 0;
+ for (const auto& f : files) {
+ total_size += f->fd.file_size;
+ }
+ return total_size;
+}
+} // anonymous namespace
+
+bool FIFOCompactionPicker::NeedsCompaction(
+ const VersionStorageInfo* vstorage) const {
+ const int kLevel0 = 0;
+ return vstorage->CompactionScore(kLevel0) >= 1;
+}
+
+Compaction* FIFOCompactionPicker::PickTTLCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+ LogBuffer* log_buffer) {
+ assert(mutable_cf_options.ttl > 0);
+
+ const int kLevel0 = 0;
+ const std::vector<FileMetaData*>& level_files = vstorage->LevelFiles(kLevel0);
+ uint64_t total_size = GetTotalFilesSize(level_files);
+
+ int64_t _current_time;
+ auto status = ioptions_.clock->GetCurrentTime(&_current_time);
+ if (!status.ok()) {
+ ROCKS_LOG_BUFFER(log_buffer,
+ "[%s] FIFO compaction: Couldn't get current time: %s. "
+ "Not doing compactions based on TTL. ",
+ cf_name.c_str(), status.ToString().c_str());
+ return nullptr;
+ }
+ const uint64_t current_time = static_cast<uint64_t>(_current_time);
+
+ if (!level0_compactions_in_progress_.empty()) {
+ ROCKS_LOG_BUFFER(
+ log_buffer,
+ "[%s] FIFO compaction: Already executing compaction. No need "
+ "to run parallel compactions since compactions are very fast",
+ cf_name.c_str());
+ return nullptr;
+ }
+
+ std::vector<CompactionInputFiles> inputs;
+ inputs.emplace_back();
+ inputs[0].level = 0;
+
+ // avoid underflow
+ if (current_time > mutable_cf_options.ttl) {
+ for (auto ritr = level_files.rbegin(); ritr != level_files.rend(); ++ritr) {
+ FileMetaData* f = *ritr;
+ assert(f);
+ if (f->fd.table_reader && f->fd.table_reader->GetTableProperties()) {
+ uint64_t creation_time =
+ f->fd.table_reader->GetTableProperties()->creation_time;
+ if (creation_time == 0 ||
+ creation_time >= (current_time - mutable_cf_options.ttl)) {
+ break;
+ }
+ }
+ total_size -= f->fd.file_size;
+ inputs[0].files.push_back(f);
+ }
+ }
+
+  // Return nullptr and proceed to size-based FIFO compaction if:
+  // 1. there are no files older than ttl, OR
+  // 2. there are some files older than ttl, but deleting them will not bring
+  //    the total size below the max_table_files_size threshold.
+ if (inputs[0].files.empty() ||
+ total_size >
+ mutable_cf_options.compaction_options_fifo.max_table_files_size) {
+ return nullptr;
+ }
+
+ for (const auto& f : inputs[0].files) {
+ uint64_t creation_time = 0;
+ assert(f);
+ if (f->fd.table_reader && f->fd.table_reader->GetTableProperties()) {
+ creation_time = f->fd.table_reader->GetTableProperties()->creation_time;
+ }
+ ROCKS_LOG_BUFFER(log_buffer,
+ "[%s] FIFO compaction: picking file %" PRIu64
+ " with creation time %" PRIu64 " for deletion",
+ cf_name.c_str(), f->fd.GetNumber(), creation_time);
+ }
+
+ Compaction* c = new Compaction(
+ vstorage, ioptions_, mutable_cf_options, mutable_db_options,
+ std::move(inputs), 0, 0, 0, 0, kNoCompression,
+ mutable_cf_options.compression_opts, Temperature::kUnknown,
+ /* max_subcompactions */ 0, {}, /* is manual */ false,
+ /* trim_ts */ "", vstorage->CompactionScore(0),
+ /* is deletion compaction */ true, /* l0_files_might_overlap */ true,
+ CompactionReason::kFIFOTtl);
+ return c;
+}
+
+// The size-based compaction picker for FIFO.
+//
+// When the entire column family size exceeds max_table_files_size, FIFO will
+// try to delete the oldest sst file(s) until the resulting column family size
+// is smaller than max_table_files_size.
+//
+// This function also takes care of the case where a DB is migrating from
+// level / universal compaction to FIFO compaction. During the migration, the
+// column family will also have non-L0 files while FIFO can only create L0
+// files. In this case, this function will purge the sst files in the
+// bottom-most non-empty level first, and the DB will eventually converge to
+// the regular FIFO case where there are only L0 files. Note that during the
+// migration, the purge order will only be an approximation of "FIFO", as
+// entries inside lower-level files might sometimes be newer than some
+// entries inside upper-level files.
+Compaction* FIFOCompactionPicker::PickSizeCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+ LogBuffer* log_buffer) {
+ // compute the total size and identify the last non-empty level
+ int last_level = 0;
+ uint64_t total_size = 0;
+ for (int level = 0; level < vstorage->num_levels(); ++level) {
+ auto level_size = GetTotalFilesSize(vstorage->LevelFiles(level));
+ total_size += level_size;
+ if (level_size > 0) {
+ last_level = level;
+ }
+ }
+ const std::vector<FileMetaData*>& last_level_files =
+ vstorage->LevelFiles(last_level);
+
+ if (last_level == 0 &&
+ total_size <=
+ mutable_cf_options.compaction_options_fifo.max_table_files_size) {
+ // total size not exceeded, try to find intra level 0 compaction if enabled
+ const std::vector<FileMetaData*>& level0_files = vstorage->LevelFiles(0);
+ if (mutable_cf_options.compaction_options_fifo.allow_compaction &&
+ level0_files.size() > 0) {
+ CompactionInputFiles comp_inputs;
+      // Try to prevent the same files from being compacted multiple times,
+      // which could produce large files that may never TTL-expire. Achieve
+      // this by disallowing compactions with files larger than the memtable
+      // (inflate its size by 10% to account for uncompressed L0 files that
+      // may have a size slightly greater than the memtable size limit).
+ size_t max_compact_bytes_per_del_file =
+ static_cast<size_t>(MultiplyCheckOverflow(
+ static_cast<uint64_t>(mutable_cf_options.write_buffer_size),
+ 1.1));
+ if (FindIntraL0Compaction(
+ level0_files,
+ mutable_cf_options
+ .level0_file_num_compaction_trigger /* min_files_to_compact */
+ ,
+ max_compact_bytes_per_del_file,
+ mutable_cf_options.max_compaction_bytes, &comp_inputs)) {
+ Compaction* c = new Compaction(
+ vstorage, ioptions_, mutable_cf_options, mutable_db_options,
+ {comp_inputs}, 0, 16 * 1024 * 1024 /* output file size limit */,
+ 0 /* max compaction bytes, not applicable */,
+ 0 /* output path ID */, mutable_cf_options.compression,
+ mutable_cf_options.compression_opts, Temperature::kUnknown,
+ 0 /* max_subcompactions */, {}, /* is manual */ false,
+ /* trim_ts */ "", vstorage->CompactionScore(0),
+ /* is deletion compaction */ false,
+ /* l0_files_might_overlap */ true,
+ CompactionReason::kFIFOReduceNumFiles);
+ return c;
+ }
+ }
+
+ ROCKS_LOG_BUFFER(
+ log_buffer,
+ "[%s] FIFO compaction: nothing to do. Total size %" PRIu64
+ ", max size %" PRIu64 "\n",
+ cf_name.c_str(), total_size,
+ mutable_cf_options.compaction_options_fifo.max_table_files_size);
+ return nullptr;
+ }
+
+ if (!level0_compactions_in_progress_.empty()) {
+ ROCKS_LOG_BUFFER(
+ log_buffer,
+ "[%s] FIFO compaction: Already executing compaction. No need "
+ "to run parallel compactions since compactions are very fast",
+ cf_name.c_str());
+ return nullptr;
+ }
+
+ std::vector<CompactionInputFiles> inputs;
+ inputs.emplace_back();
+ inputs[0].level = last_level;
+
+ if (last_level == 0) {
+ // In L0, right-most files are the oldest files.
+ for (auto ritr = last_level_files.rbegin(); ritr != last_level_files.rend();
+ ++ritr) {
+ auto f = *ritr;
+ total_size -= f->fd.file_size;
+ inputs[0].files.push_back(f);
+ char tmp_fsize[16];
+ AppendHumanBytes(f->fd.GetFileSize(), tmp_fsize, sizeof(tmp_fsize));
+ ROCKS_LOG_BUFFER(log_buffer,
+ "[%s] FIFO compaction: picking file %" PRIu64
+ " with size %s for deletion",
+ cf_name.c_str(), f->fd.GetNumber(), tmp_fsize);
+ if (total_size <=
+ mutable_cf_options.compaction_options_fifo.max_table_files_size) {
+ break;
+ }
+ }
+ } else {
+    // If the last level is non-L0, we actually don't know which file is
+    // logically the oldest since the file creation time only represents
+    // when this file was compacted to this level, which is independent
+    // of when the entries in this file were first inserted.
+    //
+    // As a result, we delete files from the left instead. This means the sst
+    // file with the smallest key will be deleted first. This design decision
+    // better serves a major class of FIFO use cases where smaller keys are
+    // associated with older data.
+ for (const auto& f : last_level_files) {
+ total_size -= f->fd.file_size;
+ inputs[0].files.push_back(f);
+ char tmp_fsize[16];
+ AppendHumanBytes(f->fd.GetFileSize(), tmp_fsize, sizeof(tmp_fsize));
+ ROCKS_LOG_BUFFER(log_buffer,
+ "[%s] FIFO compaction: picking file %" PRIu64
+ " with size %s for deletion",
+ cf_name.c_str(), f->fd.GetNumber(), tmp_fsize);
+ if (total_size <=
+ mutable_cf_options.compaction_options_fifo.max_table_files_size) {
+ break;
+ }
+ }
+ }
+
+ Compaction* c = new Compaction(
+ vstorage, ioptions_, mutable_cf_options, mutable_db_options,
+ std::move(inputs), last_level,
+ /* target_file_size */ 0,
+ /* max_compaction_bytes */ 0,
+ /* output_path_id */ 0, kNoCompression,
+ mutable_cf_options.compression_opts, Temperature::kUnknown,
+ /* max_subcompactions */ 0, {}, /* is manual */ false,
+ /* trim_ts */ "", vstorage->CompactionScore(0),
+ /* is deletion compaction */ true,
+ /* l0_files_might_overlap */ true, CompactionReason::kFIFOMaxSize);
+ return c;
+}
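+
+// Worked example for the size-based path above (numbers are hypothetical):
+// with max_table_files_size = 100MB and L0 files of sizes 40MB, 30MB, 30MB
+// and 20MB (oldest to newest, 120MB in total), the loop above walks L0 from
+// the oldest file and keeps adding files to the deletion compaction until
+// the remaining total is <= 100MB; here it stops after picking only the
+// 40MB file, leaving 80MB behind.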
+
+Compaction* FIFOCompactionPicker::PickCompactionToWarm(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+ LogBuffer* log_buffer) {
+ if (mutable_cf_options.compaction_options_fifo.age_for_warm == 0) {
+ return nullptr;
+ }
+
+  // PickCompactionToWarm is only triggered if there are no non-L0 files.
+ for (int level = 1; level < vstorage->num_levels(); ++level) {
+ if (GetTotalFilesSize(vstorage->LevelFiles(level)) > 0) {
+ return nullptr;
+ }
+ }
+
+ const int kLevel0 = 0;
+ const std::vector<FileMetaData*>& level_files = vstorage->LevelFiles(kLevel0);
+
+ int64_t _current_time;
+ auto status = ioptions_.clock->GetCurrentTime(&_current_time);
+ if (!status.ok()) {
+ ROCKS_LOG_BUFFER(log_buffer,
+ "[%s] FIFO compaction: Couldn't get current time: %s. "
+ "Not doing compactions based on warm threshold. ",
+ cf_name.c_str(), status.ToString().c_str());
+ return nullptr;
+ }
+ const uint64_t current_time = static_cast<uint64_t>(_current_time);
+
+ if (!level0_compactions_in_progress_.empty()) {
+ ROCKS_LOG_BUFFER(
+ log_buffer,
+ "[%s] FIFO compaction: Already executing compaction. Parallel "
+ "compactions are not supported",
+ cf_name.c_str());
+ return nullptr;
+ }
+
+ std::vector<CompactionInputFiles> inputs;
+ inputs.emplace_back();
+ inputs[0].level = 0;
+
+ // avoid underflow
+ if (current_time > mutable_cf_options.compaction_options_fifo.age_for_warm) {
+ uint64_t create_time_threshold =
+ current_time - mutable_cf_options.compaction_options_fifo.age_for_warm;
+ uint64_t compaction_size = 0;
+    // Ideally we would identify a file qualifying for the warm tier by the
+    // timestamp of the youngest entry in the file. However, we don't have
+    // that information right now, so we infer it from the oldest entry's
+    // timestamp of the next (just younger) file.
+ FileMetaData* prev_file = nullptr;
+ for (auto ritr = level_files.rbegin(); ritr != level_files.rend(); ++ritr) {
+ FileMetaData* f = *ritr;
+ assert(f);
+ if (f->being_compacted) {
+        // Right now this probably won't happen as we never try to schedule
+        // two compactions in parallel, so here we simply don't schedule
+        // anything.
+ return nullptr;
+ }
+ uint64_t oldest_ancester_time = f->TryGetOldestAncesterTime();
+ if (oldest_ancester_time == kUnknownOldestAncesterTime) {
+ // Older files might not have enough information. It is possible to
+ // handle these files by looking at newer files, but maintaining the
+ // logic isn't worth it.
+ break;
+ }
+ if (oldest_ancester_time > create_time_threshold) {
+ // The previous file (which has slightly older data) doesn't qualify
+ // for warm tier.
+ break;
+ }
+ if (prev_file != nullptr) {
+ compaction_size += prev_file->fd.GetFileSize();
+ if (compaction_size > mutable_cf_options.max_compaction_bytes) {
+ break;
+ }
+ inputs[0].files.push_back(prev_file);
+ ROCKS_LOG_BUFFER(log_buffer,
+ "[%s] FIFO compaction: picking file %" PRIu64
+ " with next file's oldest time %" PRIu64 " for warm",
+ cf_name.c_str(), prev_file->fd.GetNumber(),
+ oldest_ancester_time);
+ }
+ if (f->temperature == Temperature::kUnknown ||
+ f->temperature == Temperature::kHot) {
+ prev_file = f;
+ } else if (!inputs[0].files.empty()) {
+        // Found a warm file that is newer than the files already picked.
+ break;
+ } else {
+ assert(prev_file == nullptr);
+ }
+ }
+ }
+
+ if (inputs[0].files.empty()) {
+ return nullptr;
+ }
+
+ Compaction* c = new Compaction(
+ vstorage, ioptions_, mutable_cf_options, mutable_db_options,
+ std::move(inputs), 0, 0 /* output file size limit */,
+ 0 /* max compaction bytes, not applicable */, 0 /* output path ID */,
+ mutable_cf_options.compression, mutable_cf_options.compression_opts,
+ Temperature::kWarm,
+ /* max_subcompactions */ 0, {}, /* is manual */ false, /* trim_ts */ "",
+ vstorage->CompactionScore(0),
+ /* is deletion compaction */ false, /* l0_files_might_overlap */ true,
+ CompactionReason::kChangeTemperature);
+ return c;
+}
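+
+// Illustrative walk-through (timestamps are hypothetical): with
+// age_for_warm = 3600 and current_time = 10000, create_time_threshold is
+// 6400. Scanning L0 from oldest to newest, a file is added to the warm-tier
+// compaction only once its next (just younger) neighbor reports an
+// oldest_ancester_time <= 6400, and the scan stops at the first neighbor
+// whose oldest_ancester_time exceeds 6400.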
+
+Compaction* FIFOCompactionPicker::PickCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+ LogBuffer* log_buffer, SequenceNumber /*earliest_memtable_seqno*/) {
+ Compaction* c = nullptr;
+ if (mutable_cf_options.ttl > 0) {
+ c = PickTTLCompaction(cf_name, mutable_cf_options, mutable_db_options,
+ vstorage, log_buffer);
+ }
+ if (c == nullptr) {
+ c = PickSizeCompaction(cf_name, mutable_cf_options, mutable_db_options,
+ vstorage, log_buffer);
+ }
+ if (c == nullptr) {
+ c = PickCompactionToWarm(cf_name, mutable_cf_options, mutable_db_options,
+ vstorage, log_buffer);
+ }
+ RegisterCompaction(c);
+ return c;
+}
+
+Compaction* FIFOCompactionPicker::CompactRange(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+ int input_level, int output_level,
+ const CompactRangeOptions& /*compact_range_options*/,
+ const InternalKey* /*begin*/, const InternalKey* /*end*/,
+ InternalKey** compaction_end, bool* /*manual_conflict*/,
+ uint64_t /*max_file_num_to_ignore*/, const std::string& /*trim_ts*/) {
+#ifdef NDEBUG
+ (void)input_level;
+ (void)output_level;
+#endif
+ assert(input_level == 0);
+ assert(output_level == 0);
+ *compaction_end = nullptr;
+ LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, ioptions_.logger);
+ Compaction* c = PickCompaction(cf_name, mutable_cf_options,
+ mutable_db_options, vstorage, &log_buffer);
+ log_buffer.FlushBufferToLog();
+ return c;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/compaction/compaction_picker_fifo.h b/src/rocksdb/db/compaction/compaction_picker_fifo.h
new file mode 100644
index 000000000..544259f38
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_fifo.h
@@ -0,0 +1,63 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include "db/compaction/compaction_picker.h"
+
+namespace ROCKSDB_NAMESPACE {
+class FIFOCompactionPicker : public CompactionPicker {
+ public:
+ FIFOCompactionPicker(const ImmutableOptions& ioptions,
+ const InternalKeyComparator* icmp)
+ : CompactionPicker(ioptions, icmp) {}
+
+ virtual Compaction* PickCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options, VersionStorageInfo* version,
+ LogBuffer* log_buffer,
+ SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override;
+
+ virtual Compaction* CompactRange(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+ int input_level, int output_level,
+ const CompactRangeOptions& compact_range_options,
+ const InternalKey* begin, const InternalKey* end,
+ InternalKey** compaction_end, bool* manual_conflict,
+ uint64_t max_file_num_to_ignore, const std::string& trim_ts) override;
+
+ // The maximum allowed output level. Always returns 0.
+ virtual int MaxOutputLevel() const override { return 0; }
+
+ virtual bool NeedsCompaction(
+ const VersionStorageInfo* vstorage) const override;
+
+ private:
+ Compaction* PickTTLCompaction(const std::string& cf_name,
+ const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options,
+ VersionStorageInfo* version,
+ LogBuffer* log_buffer);
+
+ Compaction* PickSizeCompaction(const std::string& cf_name,
+ const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options,
+ VersionStorageInfo* version,
+ LogBuffer* log_buffer);
+
+ Compaction* PickCompactionToWarm(const std::string& cf_name,
+ const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options,
+ VersionStorageInfo* version,
+ LogBuffer* log_buffer);
+};
+} // namespace ROCKSDB_NAMESPACE
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/compaction/compaction_picker_level.cc b/src/rocksdb/db/compaction/compaction_picker_level.cc
new file mode 100644
index 000000000..b689b6add
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_level.cc
@@ -0,0 +1,841 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction_picker_level.h"
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/version_edit.h"
+#include "logging/log_buffer.h"
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool LevelCompactionPicker::NeedsCompaction(
+ const VersionStorageInfo* vstorage) const {
+ if (!vstorage->ExpiredTtlFiles().empty()) {
+ return true;
+ }
+ if (!vstorage->FilesMarkedForPeriodicCompaction().empty()) {
+ return true;
+ }
+ if (!vstorage->BottommostFilesMarkedForCompaction().empty()) {
+ return true;
+ }
+ if (!vstorage->FilesMarkedForCompaction().empty()) {
+ return true;
+ }
+ if (!vstorage->FilesMarkedForForcedBlobGC().empty()) {
+ return true;
+ }
+ for (int i = 0; i <= vstorage->MaxInputLevel(); i++) {
+ if (vstorage->CompactionScore(i) >= 1) {
+ return true;
+ }
+ }
+ return false;
+}
+
+namespace {
+// A class to build a leveled compaction step-by-step.
+class LevelCompactionBuilder {
+ public:
+ LevelCompactionBuilder(const std::string& cf_name,
+ VersionStorageInfo* vstorage,
+ SequenceNumber earliest_mem_seqno,
+ CompactionPicker* compaction_picker,
+ LogBuffer* log_buffer,
+ const MutableCFOptions& mutable_cf_options,
+ const ImmutableOptions& ioptions,
+ const MutableDBOptions& mutable_db_options)
+ : cf_name_(cf_name),
+ vstorage_(vstorage),
+ earliest_mem_seqno_(earliest_mem_seqno),
+ compaction_picker_(compaction_picker),
+ log_buffer_(log_buffer),
+ mutable_cf_options_(mutable_cf_options),
+ ioptions_(ioptions),
+ mutable_db_options_(mutable_db_options) {}
+
+ // Pick and return a compaction.
+ Compaction* PickCompaction();
+
+  // Pick the initial files to compact to the next level (or to compact
+  // together in an intra-L0 compaction).
+ void SetupInitialFiles();
+
+ // If the initial files are from L0 level, pick other L0
+ // files if needed.
+ bool SetupOtherL0FilesIfNeeded();
+
+  // Compaction with round-robin compaction priority allows more files to be
+  // picked to form a large compaction.
+  void SetupOtherFilesWithRoundRobinExpansion();
+  // Based on the initial files, set up the other files that need to be
+  // compacted in this compaction.
+ bool SetupOtherInputsIfNeeded();
+
+ Compaction* GetCompaction();
+
+  // For the specified level, pick a file that we want to compact.
+ // Returns false if there is no file to compact.
+ // If it returns true, inputs->files.size() will be exactly one for
+ // all compaction priorities except round-robin. For round-robin,
+ // multiple consecutive files may be put into inputs->files.
+ // If level is 0 and there is already a compaction on that level, this
+ // function will return false.
+ bool PickFileToCompact();
+
+ // Return true if a L0 trivial move is picked up.
+ bool TryPickL0TrivialMove();
+
+ // For L0->L0, picks the longest span of files that aren't currently
+ // undergoing compaction for which work-per-deleted-file decreases. The span
+ // always starts from the newest L0 file.
+ //
+ // Intra-L0 compaction is independent of all other files, so it can be
+ // performed even when L0->base_level compactions are blocked.
+ //
+  // Returns true if `start_level_inputs_` is populated with a span of files
+  // to be compacted; otherwise, returns false.
+ bool PickIntraL0Compaction();
+
+  // Returns true if the trivial move is extended. `start_index` is the index
+  // of the initial file picked, which should already be in
+  // `start_level_inputs_`.
+ bool TryExtendNonL0TrivialMove(int start_index);
+
+ // Picks a file from level_files to compact.
+ // level_files is a vector of (level, file metadata) in ascending order of
+ // level. If compact_to_next_level is true, compact the file to the next
+ // level, otherwise, compact to the same level as the input file.
+ void PickFileToCompact(
+ const autovector<std::pair<int, FileMetaData*>>& level_files,
+ bool compact_to_next_level);
+
+ const std::string& cf_name_;
+ VersionStorageInfo* vstorage_;
+ SequenceNumber earliest_mem_seqno_;
+ CompactionPicker* compaction_picker_;
+ LogBuffer* log_buffer_;
+ int start_level_ = -1;
+ int output_level_ = -1;
+ int parent_index_ = -1;
+ int base_index_ = -1;
+ double start_level_score_ = 0;
+ bool is_manual_ = false;
+ bool is_l0_trivial_move_ = false;
+ CompactionInputFiles start_level_inputs_;
+ std::vector<CompactionInputFiles> compaction_inputs_;
+ CompactionInputFiles output_level_inputs_;
+ std::vector<FileMetaData*> grandparents_;
+ CompactionReason compaction_reason_ = CompactionReason::kUnknown;
+
+ const MutableCFOptions& mutable_cf_options_;
+ const ImmutableOptions& ioptions_;
+ const MutableDBOptions& mutable_db_options_;
+  // Pick a path ID to place a newly generated file, given its level.
+ static uint32_t GetPathId(const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options,
+ int level);
+
+ static const int kMinFilesForIntraL0Compaction = 4;
+};
+
+void LevelCompactionBuilder::PickFileToCompact(
+ const autovector<std::pair<int, FileMetaData*>>& level_files,
+ bool compact_to_next_level) {
+ for (auto& level_file : level_files) {
+    // If the file is being compacted, there is nothing to do here.
+    // If this assert() fails, it means that some function marked some
+    // files as being_compacted but didn't call ComputeCompactionScore().
+ assert(!level_file.second->being_compacted);
+ start_level_ = level_file.first;
+ if ((compact_to_next_level &&
+ start_level_ == vstorage_->num_non_empty_levels() - 1) ||
+ (start_level_ == 0 &&
+ !compaction_picker_->level0_compactions_in_progress()->empty())) {
+ continue;
+ }
+ if (compact_to_next_level) {
+ output_level_ =
+ (start_level_ == 0) ? vstorage_->base_level() : start_level_ + 1;
+ } else {
+ output_level_ = start_level_;
+ }
+ start_level_inputs_.files = {level_file.second};
+ start_level_inputs_.level = start_level_;
+ if (compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+ &start_level_inputs_)) {
+ return;
+ }
+ }
+ start_level_inputs_.files.clear();
+}
+
+void LevelCompactionBuilder::SetupInitialFiles() {
+ // Find the compactions by size on all levels.
+ bool skipped_l0_to_base = false;
+ for (int i = 0; i < compaction_picker_->NumberLevels() - 1; i++) {
+ start_level_score_ = vstorage_->CompactionScore(i);
+ start_level_ = vstorage_->CompactionScoreLevel(i);
+ assert(i == 0 || start_level_score_ <= vstorage_->CompactionScore(i - 1));
+ if (start_level_score_ >= 1) {
+ if (skipped_l0_to_base && start_level_ == vstorage_->base_level()) {
+ // If L0->base_level compaction is pending, don't schedule further
+ // compaction from base level. Otherwise L0->base_level compaction
+ // may starve.
+ continue;
+ }
+ output_level_ =
+ (start_level_ == 0) ? vstorage_->base_level() : start_level_ + 1;
+ if (PickFileToCompact()) {
+ // found the compaction!
+ if (start_level_ == 0) {
+ // L0 score = `num L0 files` / `level0_file_num_compaction_trigger`
+ compaction_reason_ = CompactionReason::kLevelL0FilesNum;
+ } else {
+ // L1+ score = `Level files size` / `MaxBytesForLevel`
+ compaction_reason_ = CompactionReason::kLevelMaxLevelSize;
+ }
+ break;
+ } else {
+ // didn't find the compaction, clear the inputs
+ start_level_inputs_.clear();
+ if (start_level_ == 0) {
+ skipped_l0_to_base = true;
+ // L0->base_level may be blocked due to ongoing L0->base_level
+ // compactions. It may also be blocked by an ongoing compaction from
+ // base_level downwards.
+ //
+ // In these cases, to reduce L0 file count and thus reduce likelihood
+ // of write stalls, we can attempt compacting a span of files within
+ // L0.
+ if (PickIntraL0Compaction()) {
+ output_level_ = 0;
+ compaction_reason_ = CompactionReason::kLevelL0FilesNum;
+ break;
+ }
+ }
+ }
+ } else {
+ // Compaction scores are sorted in descending order, no further scores
+ // will be >= 1.
+ break;
+ }
+ }
+ if (!start_level_inputs_.empty()) {
+ return;
+ }
+
+ // if we didn't find a compaction, check if there are any files marked for
+ // compaction
+ parent_index_ = base_index_ = -1;
+
+ compaction_picker_->PickFilesMarkedForCompaction(
+ cf_name_, vstorage_, &start_level_, &output_level_, &start_level_inputs_);
+ if (!start_level_inputs_.empty()) {
+ compaction_reason_ = CompactionReason::kFilesMarkedForCompaction;
+ return;
+ }
+
+ // Bottommost Files Compaction on deleting tombstones
+ PickFileToCompact(vstorage_->BottommostFilesMarkedForCompaction(), false);
+ if (!start_level_inputs_.empty()) {
+ compaction_reason_ = CompactionReason::kBottommostFiles;
+ return;
+ }
+
+ // TTL Compaction
+ if (ioptions_.compaction_pri == kRoundRobin &&
+ !vstorage_->ExpiredTtlFiles().empty()) {
+ auto expired_files = vstorage_->ExpiredTtlFiles();
+ // the expired files list should already be sorted by level
+ start_level_ = expired_files.front().first;
+#ifndef NDEBUG
+ for (const auto& file : expired_files) {
+ assert(start_level_ <= file.first);
+ }
+#endif
+ if (start_level_ > 0) {
+ output_level_ = start_level_ + 1;
+ if (PickFileToCompact()) {
+ compaction_reason_ = CompactionReason::kRoundRobinTtl;
+ return;
+ }
+ }
+ }
+
+ PickFileToCompact(vstorage_->ExpiredTtlFiles(), true);
+ if (!start_level_inputs_.empty()) {
+ compaction_reason_ = CompactionReason::kTtl;
+ return;
+ }
+
+ // Periodic Compaction
+ PickFileToCompact(vstorage_->FilesMarkedForPeriodicCompaction(), false);
+ if (!start_level_inputs_.empty()) {
+ compaction_reason_ = CompactionReason::kPeriodicCompaction;
+ return;
+ }
+
+ // Forced blob garbage collection
+ PickFileToCompact(vstorage_->FilesMarkedForForcedBlobGC(), false);
+ if (!start_level_inputs_.empty()) {
+ compaction_reason_ = CompactionReason::kForcedBlobGC;
+ return;
+ }
+}
+
+bool LevelCompactionBuilder::SetupOtherL0FilesIfNeeded() {
+ if (start_level_ == 0 && output_level_ != 0 && !is_l0_trivial_move_) {
+ return compaction_picker_->GetOverlappingL0Files(
+ vstorage_, &start_level_inputs_, output_level_, &parent_index_);
+ }
+ return true;
+}
+
+void LevelCompactionBuilder::SetupOtherFilesWithRoundRobinExpansion() {
+ // We only expand when the start level is not L0 under round robin
+ assert(start_level_ >= 1);
+
+ // For round-robin compaction priority, we have 3 constraints when picking
+ // multiple files.
+ // Constraint 1: We can only pick consecutive files
+ // -> Constraint 1a: When a file is being compacted (or some input files
+  //                   are being compacted after expanding), we cannot
+ // choose it and have to stop choosing more files
+ // -> Constraint 1b: When we reach the last file (with largest keys), we
+ // cannot choose more files (the next file will be the
+ // first one)
+ // Constraint 2: We should ensure the total compaction bytes (including the
+ // overlapped files from the next level) is no more than
+ // mutable_cf_options_.max_compaction_bytes
+ // Constraint 3: We try our best to pick as many files as possible so that
+ // the post-compaction level size is less than
+ // MaxBytesForLevel(start_level_)
+ // Constraint 4: We do not expand if it is possible to apply a trivial move
+ // Constraint 5 (TODO): Try to pick minimal files to split into the target
+ // number of subcompactions
+ TEST_SYNC_POINT("LevelCompactionPicker::RoundRobin");
+
+ // Only expand the inputs when we have selected a file in start_level_inputs_
+ if (start_level_inputs_.size() == 0) return;
+
+ uint64_t start_lvl_bytes_no_compacting = 0;
+ uint64_t curr_bytes_to_compact = 0;
+ uint64_t start_lvl_max_bytes_to_compact = 0;
+ const std::vector<FileMetaData*>& level_files =
+ vstorage_->LevelFiles(start_level_);
+ // Constraint 3 (pre-calculate the ideal max bytes to compact)
+ for (auto f : level_files) {
+ if (!f->being_compacted) {
+ start_lvl_bytes_no_compacting += f->fd.GetFileSize();
+ }
+ }
+ if (start_lvl_bytes_no_compacting >
+ vstorage_->MaxBytesForLevel(start_level_)) {
+ start_lvl_max_bytes_to_compact = start_lvl_bytes_no_compacting -
+ vstorage_->MaxBytesForLevel(start_level_);
+ }
+
+ size_t start_index = vstorage_->FilesByCompactionPri(start_level_)[0];
+ InternalKey smallest, largest;
+ // Constraint 4 (No need to check again later)
+ compaction_picker_->GetRange(start_level_inputs_, &smallest, &largest);
+ CompactionInputFiles output_level_inputs;
+ output_level_inputs.level = output_level_;
+ vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest,
+ &output_level_inputs.files);
+ if (output_level_inputs.empty()) {
+ if (TryExtendNonL0TrivialMove((int)start_index)) {
+ return;
+ }
+ }
+ // Constraint 3
+ if (start_level_inputs_[0]->fd.GetFileSize() >=
+ start_lvl_max_bytes_to_compact) {
+ return;
+ }
+ CompactionInputFiles tmp_start_level_inputs;
+ tmp_start_level_inputs = start_level_inputs_;
+ // TODO (zichen): Future parallel round-robin may also need to update this
+ // Constraint 1b (only expand till the end)
+ for (size_t i = start_index + 1; i < level_files.size(); i++) {
+ auto* f = level_files[i];
+ if (f->being_compacted) {
+ // Constraint 1a
+ return;
+ }
+
+ tmp_start_level_inputs.files.push_back(f);
+ if (!compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+ &tmp_start_level_inputs) ||
+ compaction_picker_->FilesRangeOverlapWithCompaction(
+ {tmp_start_level_inputs}, output_level_,
+ Compaction::EvaluatePenultimateLevel(
+ vstorage_, ioptions_, start_level_, output_level_))) {
+ // Constraint 1a
+ tmp_start_level_inputs.clear();
+ return;
+ }
+
+ curr_bytes_to_compact = 0;
+ for (auto start_lvl_f : tmp_start_level_inputs.files) {
+ curr_bytes_to_compact += start_lvl_f->fd.GetFileSize();
+ }
+
+ // Check whether any output level files are locked
+ compaction_picker_->GetRange(tmp_start_level_inputs, &smallest, &largest);
+ vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest,
+ &output_level_inputs.files);
+ if (!output_level_inputs.empty() &&
+ !compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+ &output_level_inputs)) {
+ // Constraint 1a
+ tmp_start_level_inputs.clear();
+ return;
+ }
+
+ uint64_t start_lvl_curr_bytes_to_compact = curr_bytes_to_compact;
+ for (auto output_lvl_f : output_level_inputs.files) {
+ curr_bytes_to_compact += output_lvl_f->fd.GetFileSize();
+ }
+ if (curr_bytes_to_compact > mutable_cf_options_.max_compaction_bytes) {
+ // Constraint 2
+ tmp_start_level_inputs.clear();
+ return;
+ }
+
+ start_level_inputs_.files = tmp_start_level_inputs.files;
+ // Constraint 3
+ if (start_lvl_curr_bytes_to_compact > start_lvl_max_bytes_to_compact) {
+ return;
+ }
+ }
+}
+
+bool LevelCompactionBuilder::SetupOtherInputsIfNeeded() {
+  // Set up input files from the output level. For output to L0, we only
+  // compact spans of files that do not interact with any pending compactions,
+  // so we don't need to consider other levels.
+ if (output_level_ != 0) {
+ output_level_inputs_.level = output_level_;
+ bool round_robin_expanding =
+ ioptions_.compaction_pri == kRoundRobin &&
+ compaction_reason_ == CompactionReason::kLevelMaxLevelSize;
+ if (round_robin_expanding) {
+ SetupOtherFilesWithRoundRobinExpansion();
+ }
+ if (!is_l0_trivial_move_ &&
+ !compaction_picker_->SetupOtherInputs(
+ cf_name_, mutable_cf_options_, vstorage_, &start_level_inputs_,
+ &output_level_inputs_, &parent_index_, base_index_,
+ round_robin_expanding)) {
+ return false;
+ }
+
+ compaction_inputs_.push_back(start_level_inputs_);
+ if (!output_level_inputs_.empty()) {
+ compaction_inputs_.push_back(output_level_inputs_);
+ }
+
+ if (!is_l0_trivial_move_) {
+      // In some edge cases we could pick a compaction that will be compacting
+      // a key range that overlaps with another running compaction, and both
+      // of them have the same output level. This could happen if
+      // (1) we are running a non-exclusive manual compaction, or
+      // (2) AddFile ingests a new file into the LSM tree.
+      // We need to disallow this from happening.
+ if (compaction_picker_->FilesRangeOverlapWithCompaction(
+ compaction_inputs_, output_level_,
+ Compaction::EvaluatePenultimateLevel(
+ vstorage_, ioptions_, start_level_, output_level_))) {
+        // This compaction output could potentially conflict with the output
+        // of a currently running compaction, so we cannot run it.
+ return false;
+ }
+ compaction_picker_->GetGrandparents(vstorage_, start_level_inputs_,
+ output_level_inputs_, &grandparents_);
+ }
+ } else {
+ compaction_inputs_.push_back(start_level_inputs_);
+ }
+ return true;
+}
+
+Compaction* LevelCompactionBuilder::PickCompaction() {
+ // Pick up the first file to start compaction. It may have been extended
+ // to a clean cut.
+ SetupInitialFiles();
+ if (start_level_inputs_.empty()) {
+ return nullptr;
+ }
+ assert(start_level_ >= 0 && output_level_ >= 0);
+
+ // If it is a L0 -> base level compaction, we need to set up other L0
+ // files if needed.
+ if (!SetupOtherL0FilesIfNeeded()) {
+ return nullptr;
+ }
+
+ // Pick files in the output level and expand more files in the start level
+ // if needed.
+ if (!SetupOtherInputsIfNeeded()) {
+ return nullptr;
+ }
+
+ // Form a compaction object containing the files we picked.
+ Compaction* c = GetCompaction();
+
+ TEST_SYNC_POINT_CALLBACK("LevelCompactionPicker::PickCompaction:Return", c);
+
+ return c;
+}
+
+Compaction* LevelCompactionBuilder::GetCompaction() {
+ auto c = new Compaction(
+ vstorage_, ioptions_, mutable_cf_options_, mutable_db_options_,
+ std::move(compaction_inputs_), output_level_,
+ MaxFileSizeForLevel(mutable_cf_options_, output_level_,
+ ioptions_.compaction_style, vstorage_->base_level(),
+ ioptions_.level_compaction_dynamic_level_bytes),
+ mutable_cf_options_.max_compaction_bytes,
+ GetPathId(ioptions_, mutable_cf_options_, output_level_),
+ GetCompressionType(vstorage_, mutable_cf_options_, output_level_,
+ vstorage_->base_level()),
+ GetCompressionOptions(mutable_cf_options_, vstorage_, output_level_),
+ Temperature::kUnknown,
+ /* max_subcompactions */ 0, std::move(grandparents_), is_manual_,
+ /* trim_ts */ "", start_level_score_, false /* deletion_compaction */,
+ /* l0_files_might_overlap */ start_level_ == 0 && !is_l0_trivial_move_,
+ compaction_reason_);
+
+  // If it's a level 0 compaction, make sure we don't execute any other level 0
+  // compactions in parallel.
+ compaction_picker_->RegisterCompaction(c);
+
+  // Creating a compaction influences the compaction score because the score
+  // takes running compactions into account (by skipping files that are
+  // already being compacted). Since we just changed the compaction score, we
+  // recalculate it here.
+ vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_);
+ return c;
+}
+
+/*
+ * Find the optimal path to place a file.
+ * Given a level, finds the path whose cumulative target size (together with
+ * the paths before it) can hold all levels up to and including that level.
+ */
+uint32_t LevelCompactionBuilder::GetPathId(
+ const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options, int level) {
+ uint32_t p = 0;
+ assert(!ioptions.cf_paths.empty());
+
+ // size remaining in the most recent path
+ uint64_t current_path_size = ioptions.cf_paths[0].target_size;
+
+ uint64_t level_size;
+ int cur_level = 0;
+
+ // max_bytes_for_level_base denotes L1 size.
+ // We estimate L0 size to be the same as L1.
+ level_size = mutable_cf_options.max_bytes_for_level_base;
+
+ // Last path is the fallback
+ while (p < ioptions.cf_paths.size() - 1) {
+ if (level_size <= current_path_size) {
+ if (cur_level == level) {
+ // Does desired level fit in this path?
+ return p;
+ } else {
+ current_path_size -= level_size;
+ if (cur_level > 0) {
+ if (ioptions.level_compaction_dynamic_level_bytes) {
+            // Currently, level_compaction_dynamic_level_bytes is ignored when
+            // multiple db paths are specified. See
+            // https://github.com/facebook/rocksdb/blob/main/db/column_family.cc.
+            // Still, adding this check to avoid accidentally using
+            // max_bytes_for_level_multiplier_additional.
+ level_size = static_cast<uint64_t>(
+ level_size * mutable_cf_options.max_bytes_for_level_multiplier);
+ } else {
+ level_size = static_cast<uint64_t>(
+ level_size * mutable_cf_options.max_bytes_for_level_multiplier *
+ mutable_cf_options.MaxBytesMultiplerAdditional(cur_level));
+ }
+ }
+ cur_level++;
+ continue;
+ }
+ }
+ p++;
+ current_path_size = ioptions.cf_paths[p].target_size;
+ }
+ return p;
+}
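+
+// Worked example (hypothetical configuration): with cf_paths target sizes of
+// {5GB, 50GB, <fallback>}, max_bytes_for_level_base = 1GB and
+// max_bytes_for_level_multiplier = 10, the loop above places L0 and L1 (1GB
+// each) on path 0; L2 (estimated 10GB) no longer fits the 3GB remaining on
+// path 0 and goes to path 1; L3 (estimated 100GB) falls through to the last
+// (fallback) path.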
+
+bool LevelCompactionBuilder::TryPickL0TrivialMove() {
+ if (vstorage_->base_level() <= 0) {
+ return false;
+ }
+ if (start_level_ == 0 && mutable_cf_options_.compression_per_level.empty() &&
+ !vstorage_->LevelFiles(output_level_).empty() &&
+ ioptions_.db_paths.size() <= 1) {
+ // Try to pick trivial move from L0 to L1. We start from the oldest
+ // file. We keep expanding to newer files if it would form a
+ // trivial move.
+    // For now we don't support it when
+    // mutable_cf_options_.compression_per_level is set, to avoid the extra
+    // logic of determining whether L0 files can be trivially moved to the
+    // next level.
+    // We skip the case where the output level is empty, since in that case
+    // at least the oldest file would qualify for a trivial move, and this
+    // would be a surprising behavior with few benefits.
+
+    // We search from the oldest file to the newest. In theory, files in the
+    // middle could form a trivial move too, but that is probably uncommon
+    // and we ignore these cases for simplicity.
+ const std::vector<FileMetaData*>& level_files =
+ vstorage_->LevelFiles(start_level_);
+
+ InternalKey my_smallest, my_largest;
+ for (auto it = level_files.rbegin(); it != level_files.rend(); ++it) {
+ CompactionInputFiles output_level_inputs;
+ output_level_inputs.level = output_level_;
+ FileMetaData* file = *it;
+ if (it == level_files.rbegin()) {
+ my_smallest = file->smallest;
+ my_largest = file->largest;
+ } else {
+ if (compaction_picker_->icmp()->Compare(file->largest, my_smallest) <
+ 0) {
+ my_smallest = file->smallest;
+ } else if (compaction_picker_->icmp()->Compare(file->smallest,
+ my_largest) > 0) {
+ my_largest = file->largest;
+ } else {
+ break;
+ }
+ }
+ vstorage_->GetOverlappingInputs(output_level_, &my_smallest, &my_largest,
+ &output_level_inputs.files);
+ if (output_level_inputs.empty()) {
+ assert(!file->being_compacted);
+ start_level_inputs_.files.push_back(file);
+ } else {
+ break;
+ }
+ }
+ }
+
+ if (!start_level_inputs_.empty()) {
+    // Sort files by key range. Not sure it's 100% necessary, but it's cleaner
+    // to always keep files sorted by key when the key ranges don't overlap.
+ std::sort(start_level_inputs_.files.begin(),
+ start_level_inputs_.files.end(),
+ [icmp = compaction_picker_->icmp()](FileMetaData* f1,
+ FileMetaData* f2) -> bool {
+ return (icmp->Compare(f1->smallest, f2->smallest) < 0);
+ });
+
+ is_l0_trivial_move_ = true;
+ return true;
+ }
+ return false;
+}
+
+bool LevelCompactionBuilder::TryExtendNonL0TrivialMove(int start_index) {
+ if (start_level_inputs_.size() == 1 &&
+ (ioptions_.db_paths.empty() || ioptions_.db_paths.size() == 1) &&
+ (mutable_cf_options_.compression_per_level.empty())) {
+    // Only the file at `start_index` has been picked so far, and it is likely
+    // a trivial move. Try to expand while it remains a trivial move, but not
+    // beyond max_compaction_bytes or 4 files, so that we don't create too
+    // much compaction pressure for the next level.
+    // Ignore the case where there is more than one DB path, as it would be
+    // hard to predict whether it is a trivial move.
+ const std::vector<FileMetaData*>& level_files =
+ vstorage_->LevelFiles(start_level_);
+ const size_t kMaxMultiTrivialMove = 4;
+ FileMetaData* initial_file = start_level_inputs_.files[0];
+ size_t total_size = initial_file->fd.GetFileSize();
+ CompactionInputFiles output_level_inputs;
+ output_level_inputs.level = output_level_;
+ for (int i = start_index + 1;
+ i < static_cast<int>(level_files.size()) &&
+ start_level_inputs_.size() < kMaxMultiTrivialMove;
+ i++) {
+ FileMetaData* next_file = level_files[i];
+ if (next_file->being_compacted) {
+ break;
+ }
+ vstorage_->GetOverlappingInputs(output_level_, &(initial_file->smallest),
+ &(next_file->largest),
+ &output_level_inputs.files);
+ if (!output_level_inputs.empty()) {
+ break;
+ }
+ if (i < static_cast<int>(level_files.size()) - 1 &&
+ compaction_picker_->icmp()
+ ->user_comparator()
+ ->CompareWithoutTimestamp(
+ next_file->largest.user_key(),
+ level_files[i + 1]->smallest.user_key()) == 0) {
+ TEST_SYNC_POINT_CALLBACK(
+ "LevelCompactionBuilder::TryExtendNonL0TrivialMove:NoCleanCut",
+ nullptr);
+        // Not a clean cut after adding the next file. Skip.
+ break;
+ }
+ total_size += next_file->fd.GetFileSize();
+ if (total_size > mutable_cf_options_.max_compaction_bytes) {
+ break;
+ }
+ start_level_inputs_.files.push_back(next_file);
+ }
+ return start_level_inputs_.size() > 1;
+ }
+ return false;
+}
+
+bool LevelCompactionBuilder::PickFileToCompact() {
+  // Level 0 files are overlapping, so we cannot pick more than one
+  // concurrent compaction at this level. This could be improved by looking
+  // at the key ranges that are being compacted at level 0.
+ if (start_level_ == 0 &&
+ !compaction_picker_->level0_compactions_in_progress()->empty()) {
+ TEST_SYNC_POINT("LevelCompactionPicker::PickCompactionBySize:0");
+ return false;
+ }
+
+ start_level_inputs_.clear();
+ start_level_inputs_.level = start_level_;
+
+ assert(start_level_ >= 0);
+
+ if (TryPickL0TrivialMove()) {
+ return true;
+ }
+
+ const std::vector<FileMetaData*>& level_files =
+ vstorage_->LevelFiles(start_level_);
+
+ // Pick the file with the highest score in this level that is not already
+ // being compacted.
+ const std::vector<int>& file_scores =
+ vstorage_->FilesByCompactionPri(start_level_);
+
+ unsigned int cmp_idx;
+ for (cmp_idx = vstorage_->NextCompactionIndex(start_level_);
+ cmp_idx < file_scores.size(); cmp_idx++) {
+ int index = file_scores[cmp_idx];
+ auto* f = level_files[index];
+
+    // Do not pick a file to compact if it is already being compacted
+    // by a compaction from level n-1.
+ if (f->being_compacted) {
+ if (ioptions_.compaction_pri == kRoundRobin) {
+ // TODO(zichen): this file may be involved in one compaction from
+ // an upper level, cannot advance the cursor for round-robin policy.
+ // Currently, we do not pick any file to compact in this case. We
+ // should fix this later to ensure a compaction is picked but the
+ // cursor shall not be advanced.
+ return false;
+ }
+ continue;
+ }
+
+ start_level_inputs_.files.push_back(f);
+ if (!compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+ &start_level_inputs_) ||
+ compaction_picker_->FilesRangeOverlapWithCompaction(
+ {start_level_inputs_}, output_level_,
+ Compaction::EvaluatePenultimateLevel(
+ vstorage_, ioptions_, start_level_, output_level_))) {
+ // A locked (pending compaction) input-level file was pulled in due to
+ // user-key overlap.
+ start_level_inputs_.clear();
+
+ if (ioptions_.compaction_pri == kRoundRobin) {
+ return false;
+ }
+ continue;
+ }
+
+ // Now that input level is fully expanded, we check whether any output
+ // files are locked due to pending compaction.
+ //
+ // Note we rely on ExpandInputsToCleanCut() to tell us whether any output-
+ // level files are locked, not just the extra ones pulled in for user-key
+ // overlap.
+ InternalKey smallest, largest;
+ compaction_picker_->GetRange(start_level_inputs_, &smallest, &largest);
+ CompactionInputFiles output_level_inputs;
+ output_level_inputs.level = output_level_;
+ vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest,
+ &output_level_inputs.files);
+ if (output_level_inputs.empty()) {
+ if (TryExtendNonL0TrivialMove(index)) {
+ break;
+ }
+ } else {
+ if (!compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+ &output_level_inputs)) {
+ start_level_inputs_.clear();
+ if (ioptions_.compaction_pri == kRoundRobin) {
+ return false;
+ }
+ continue;
+ }
+ }
+
+ base_index_ = index;
+ break;
+ }
+
+ // store where to start the iteration in the next call to PickCompaction
+ if (ioptions_.compaction_pri != kRoundRobin) {
+ vstorage_->SetNextCompactionIndex(start_level_, cmp_idx);
+ }
+ return start_level_inputs_.size() > 0;
+}
+
+bool LevelCompactionBuilder::PickIntraL0Compaction() {
+ start_level_inputs_.clear();
+ const std::vector<FileMetaData*>& level_files =
+ vstorage_->LevelFiles(0 /* level */);
+ if (level_files.size() <
+ static_cast<size_t>(
+ mutable_cf_options_.level0_file_num_compaction_trigger + 2) ||
+ level_files[0]->being_compacted) {
+    // If L0 isn't accumulating many files beyond the regular trigger, don't
+    // resort to L0->L0 compaction yet.
+ return false;
+ }
+ return FindIntraL0Compaction(level_files, kMinFilesForIntraL0Compaction,
+ std::numeric_limits<uint64_t>::max(),
+ mutable_cf_options_.max_compaction_bytes,
+ &start_level_inputs_, earliest_mem_seqno_);
+}
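+
+// For example (hypothetical setting): with level0_file_num_compaction_trigger
+// = 4, intra-L0 compaction is only attempted once L0 holds at least 6 files
+// and the newest L0 file is not already being compacted.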
+} // namespace
+
+Compaction* LevelCompactionPicker::PickCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+ LogBuffer* log_buffer, SequenceNumber earliest_mem_seqno) {
+ LevelCompactionBuilder builder(cf_name, vstorage, earliest_mem_seqno, this,
+ log_buffer, mutable_cf_options, ioptions_,
+ mutable_db_options);
+ return builder.PickCompaction();
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_picker_level.h b/src/rocksdb/db/compaction/compaction_picker_level.h
new file mode 100644
index 000000000..42a9b60a6
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_level.h
@@ -0,0 +1,33 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "db/compaction/compaction_picker.h"
+
+namespace ROCKSDB_NAMESPACE {
+// Picking compactions for leveled compaction. See wiki page
+// https://github.com/facebook/rocksdb/wiki/Leveled-Compaction
+// for description of Leveled compaction.
+class LevelCompactionPicker : public CompactionPicker {
+ public:
+ LevelCompactionPicker(const ImmutableOptions& ioptions,
+ const InternalKeyComparator* icmp)
+ : CompactionPicker(ioptions, icmp) {}
+ virtual Compaction* PickCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+ LogBuffer* log_buffer,
+ SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override;
+
+ virtual bool NeedsCompaction(
+ const VersionStorageInfo* vstorage) const override;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_picker_test.cc b/src/rocksdb/db/compaction/compaction_picker_test.cc
new file mode 100644
index 000000000..2e2e566c0
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_test.cc
@@ -0,0 +1,3964 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <limits>
+#include <string>
+#include <utility>
+
+#include "db/compaction/compaction.h"
+#include "db/compaction/compaction_picker_fifo.h"
+#include "db/compaction/compaction_picker_level.h"
+#include "db/compaction/compaction_picker_universal.h"
+#include "db/compaction/file_pri.h"
+#include "rocksdb/advanced_options.h"
+#include "table/unique_id_impl.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class CountingLogger : public Logger {
+ public:
+ using Logger::Logv;
+ void Logv(const char* /*format*/, va_list /*ap*/) override { log_count++; }
+ size_t log_count;
+};
+
+class CompactionPickerTestBase : public testing::Test {
+ public:
+ const Comparator* ucmp_;
+ InternalKeyComparator icmp_;
+ Options options_;
+ ImmutableOptions ioptions_;
+ MutableCFOptions mutable_cf_options_;
+ MutableDBOptions mutable_db_options_;
+ LevelCompactionPicker level_compaction_picker;
+ std::string cf_name_;
+ CountingLogger logger_;
+ LogBuffer log_buffer_;
+ uint32_t file_num_;
+ CompactionOptionsFIFO fifo_options_;
+ std::unique_ptr<VersionStorageInfo> vstorage_;
+ std::vector<std::unique_ptr<FileMetaData>> files_;
+ // does not own FileMetaData
+ std::unordered_map<uint32_t, std::pair<FileMetaData*, int>> file_map_;
+ // input files to compaction process.
+ std::vector<CompactionInputFiles> input_files_;
+ int compaction_level_start_;
+
+ explicit CompactionPickerTestBase(const Comparator* _ucmp)
+ : ucmp_(_ucmp),
+ icmp_(ucmp_),
+ options_(CreateOptions(ucmp_)),
+ ioptions_(options_),
+ mutable_cf_options_(options_),
+ mutable_db_options_(),
+ level_compaction_picker(ioptions_, &icmp_),
+ cf_name_("dummy"),
+ log_buffer_(InfoLogLevel::INFO_LEVEL, &logger_),
+ file_num_(1),
+ vstorage_(nullptr) {
+ mutable_cf_options_.ttl = 0;
+ mutable_cf_options_.periodic_compaction_seconds = 0;
+ // ioptions_.compaction_pri = kMinOverlappingRatio has its own set of
+ // tests to cover.
+ ioptions_.compaction_pri = kByCompensatedSize;
+ fifo_options_.max_table_files_size = 1;
+ mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+ ioptions_.cf_paths.emplace_back("dummy",
+ std::numeric_limits<uint64_t>::max());
+ }
+
+ ~CompactionPickerTestBase() override {}
+
+ void NewVersionStorage(int num_levels, CompactionStyle style) {
+ DeleteVersionStorage();
+ options_.num_levels = num_levels;
+ vstorage_.reset(new VersionStorageInfo(&icmp_, ucmp_, options_.num_levels,
+ style, nullptr, false));
+ vstorage_->PrepareForVersionAppend(ioptions_, mutable_cf_options_);
+ }
+
+  // Create a new VersionStorageInfo object so we can add more files and then
+  // merge it with the existing VersionStorageInfo.
+ void AddVersionStorage() {
+ temp_vstorage_.reset(new VersionStorageInfo(
+ &icmp_, ucmp_, options_.num_levels, ioptions_.compaction_style,
+ vstorage_.get(), false));
+ }
+
+ void DeleteVersionStorage() {
+ vstorage_.reset();
+ temp_vstorage_.reset();
+ files_.clear();
+ file_map_.clear();
+ input_files_.clear();
+ }
+
+ // REQUIRES: smallest and largest are c-style strings ending with '\0'
+ void Add(int level, uint32_t file_number, const char* smallest,
+ const char* largest, uint64_t file_size = 1, uint32_t path_id = 0,
+ SequenceNumber smallest_seq = 100, SequenceNumber largest_seq = 100,
+ size_t compensated_file_size = 0, bool marked_for_compact = false,
+ Temperature temperature = Temperature::kUnknown,
+ uint64_t oldest_ancestor_time = kUnknownOldestAncesterTime,
+ Slice ts_of_smallest = Slice(), Slice ts_of_largest = Slice()) {
+ assert(ts_of_smallest.size() == ucmp_->timestamp_size());
+ assert(ts_of_largest.size() == ucmp_->timestamp_size());
+
+ VersionStorageInfo* vstorage;
+ if (temp_vstorage_) {
+ vstorage = temp_vstorage_.get();
+ } else {
+ vstorage = vstorage_.get();
+ }
+ assert(level < vstorage->num_levels());
+ char* smallest_key_buf = nullptr;
+ char* largest_key_buf = nullptr;
+
+ if (!ts_of_smallest.empty()) {
+ smallest_key_buf = new char[strlen(smallest) + ucmp_->timestamp_size()];
+ memcpy(smallest_key_buf, smallest, strlen(smallest));
+ memcpy(smallest_key_buf + strlen(smallest), ts_of_smallest.data(),
+ ucmp_->timestamp_size());
+ largest_key_buf = new char[strlen(largest) + ucmp_->timestamp_size()];
+ memcpy(largest_key_buf, largest, strlen(largest));
+ memcpy(largest_key_buf + strlen(largest), ts_of_largest.data(),
+ ucmp_->timestamp_size());
+ }
+
+ InternalKey smallest_ikey = InternalKey(
+ smallest_key_buf ? Slice(smallest_key_buf,
+ ucmp_->timestamp_size() + strlen(smallest))
+ : smallest,
+ smallest_seq, kTypeValue);
+ InternalKey largest_ikey = InternalKey(
+ largest_key_buf
+ ? Slice(largest_key_buf, ucmp_->timestamp_size() + strlen(largest))
+ : largest,
+ largest_seq, kTypeValue);
+
+ FileMetaData* f = new FileMetaData(
+ file_number, path_id, file_size, smallest_ikey, largest_ikey,
+ smallest_seq, largest_seq, marked_for_compact, temperature,
+ kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ f->compensated_file_size =
+ (compensated_file_size != 0) ? compensated_file_size : file_size;
+ f->oldest_ancester_time = oldest_ancestor_time;
+ vstorage->AddFile(level, f);
+ files_.emplace_back(f);
+ file_map_.insert({file_number, {f, level}});
+
+ delete[] smallest_key_buf;
+ delete[] largest_key_buf;
+ }
+
+ void SetCompactionInputFilesLevels(int level_count, int start_level) {
+ input_files_.resize(level_count);
+ for (int i = 0; i < level_count; ++i) {
+ input_files_[i].level = start_level + i;
+ }
+ compaction_level_start_ = start_level;
+ }
+
+ void AddToCompactionFiles(uint32_t file_number) {
+ auto iter = file_map_.find(file_number);
+ assert(iter != file_map_.end());
+ int level = iter->second.second;
+ assert(level < vstorage_->num_levels());
+ input_files_[level - compaction_level_start_].files.emplace_back(
+ iter->second.first);
+ }
+
+ void UpdateVersionStorageInfo() {
+ if (temp_vstorage_) {
+ VersionBuilder builder(FileOptions(), &ioptions_, nullptr,
+ vstorage_.get(), nullptr);
+ ASSERT_OK(builder.SaveTo(temp_vstorage_.get()));
+ vstorage_ = std::move(temp_vstorage_);
+ }
+ vstorage_->PrepareForVersionAppend(ioptions_, mutable_cf_options_);
+ vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_);
+ vstorage_->SetFinalized();
+ }
+
+ private:
+ Options CreateOptions(const Comparator* ucmp) const {
+ Options opts;
+ opts.comparator = ucmp;
+ return opts;
+ }
+
+ std::unique_ptr<VersionStorageInfo> temp_vstorage_;
+};
+
+class CompactionPickerTest : public CompactionPickerTestBase {
+ public:
+ explicit CompactionPickerTest()
+ : CompactionPickerTestBase(BytewiseComparator()) {}
+
+ ~CompactionPickerTest() override {}
+};
+
+class CompactionPickerU64TsTest : public CompactionPickerTestBase {
+ public:
+ explicit CompactionPickerU64TsTest()
+ : CompactionPickerTestBase(test::BytewiseComparatorWithU64TsWrapper()) {}
+
+ ~CompactionPickerU64TsTest() override {}
+};
+
+TEST_F(CompactionPickerTest, Empty) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ UpdateVersionStorageInfo();
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() == nullptr);
+}
+
+TEST_F(CompactionPickerTest, Single) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ Add(0, 1U, "p", "q");
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() == nullptr);
+}
+
+TEST_F(CompactionPickerTest, Level0Trigger) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ Add(0, 1U, "150", "200");
+ Add(0, 2U, "200", "250");
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+}
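+
+// Illustrative sketch (not part of the upstream test): for level 0 the
+// compaction score is roughly the number of L0 files divided by
+// level0_file_num_compaction_trigger, so with the two files added above and
+// a trigger of 2 the score reaches 2 / 2 = 1 and a compaction is picked.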
+
+TEST_F(CompactionPickerTest, Level1Trigger) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(1, 66U, "150", "200", 1000000000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(66U, compaction->input(0, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, Level1Trigger2) {
+ mutable_cf_options_.target_file_size_base = 10000000000;
+ mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(1, 66U, "150", "200", 1000000001U);
+ Add(1, 88U, "201", "300", 1000000000U);
+ Add(2, 6U, "150", "179", 1000000000U);
+ Add(2, 7U, "180", "220", 1000000000U);
+ Add(2, 8U, "221", "300", 1000000000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(2U, compaction->num_input_files(1));
+ ASSERT_EQ(66U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(6U, compaction->input(1, 0)->fd.GetNumber());
+ ASSERT_EQ(7U, compaction->input(1, 1)->fd.GetNumber());
+ ASSERT_EQ(uint64_t{1073741824}, compaction->OutputFilePreallocationSize());
+}
+
+TEST_F(CompactionPickerTest, LevelMaxScore) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.target_file_size_base = 10000000;
+ mutable_cf_options_.max_bytes_for_level_base = 10 * 1024 * 1024;
+ mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+ Add(0, 1U, "150", "200", 1000000U);
+ // Level 1 score 1.2
+ Add(1, 66U, "150", "200", 6000000U);
+ Add(1, 88U, "201", "300", 6000000U);
+ // Level 2 score 1.8. File 7 is the largest. Should be picked
+ Add(2, 6U, "150", "179", 60000000U);
+ Add(2, 7U, "180", "220", 60000001U);
+ Add(2, 8U, "221", "300", 60000000U);
+ // Level 3 score slightly larger than 1
+ Add(3, 26U, "150", "170", 260000000U);
+ Add(3, 27U, "171", "179", 260000000U);
+ Add(3, 28U, "191", "220", 260000000U);
+ Add(3, 29U, "221", "300", 260000000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(mutable_cf_options_.target_file_size_base +
+ mutable_cf_options_.target_file_size_base / 10,
+ compaction->OutputFilePreallocationSize());
+}
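+
+// Illustrative sketch (not part of the upstream test): a level's score is
+// roughly its total (compensated) bytes divided by its target size. With
+// max_bytes_for_level_base ~= 10MB and a multiplier of 10, level 1 holds
+// ~12MB against a ~10MB target (score ~1.2) while level 2 holds ~180MB
+// against a ~100MB target (score ~1.8), so level 2 has the max score and is
+// compacted first.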
+
+TEST_F(CompactionPickerTest, NeedsCompactionLevel) {
+ const int kLevels = 6;
+ const int kFileCount = 20;
+
+ for (int level = 0; level < kLevels - 1; ++level) {
+ NewVersionStorage(kLevels, kCompactionStyleLevel);
+ uint64_t file_size = vstorage_->MaxBytesForLevel(level) * 2 / kFileCount;
+ for (int file_count = 1; file_count <= kFileCount; ++file_count) {
+ // start a brand new version in each test.
+ NewVersionStorage(kLevels, kCompactionStyleLevel);
+ for (int i = 0; i < file_count; ++i) {
+ Add(level, i, std::to_string((i + 100) * 1000).c_str(),
+ std::to_string((i + 100) * 1000 + 999).c_str(), file_size, 0,
+ i * 100, i * 100 + 99);
+ }
+ UpdateVersionStorageInfo();
+ ASSERT_EQ(vstorage_->CompactionScoreLevel(0), level);
+ ASSERT_EQ(level_compaction_picker.NeedsCompaction(vstorage_.get()),
+ vstorage_->CompactionScore(0) >= 1);
+ // release the version storage
+ DeleteVersionStorage();
+ }
+ }
+}
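+
+// Illustrative sketch (not part of the upstream test): each file above is
+// sized at MaxBytesForLevel(level) * 2 / kFileCount, so the level reaches its
+// target size after kFileCount / 2 = 10 files; from that point on the score
+// is >= 1 and NeedsCompaction() is expected to return true, which is what
+// the loop asserts.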
+
+TEST_F(CompactionPickerTest, Level0TriggerDynamic) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 200;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200");
+ Add(0, 2U, "200", "250");
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(1, static_cast<int>(compaction->num_input_levels()));
+ ASSERT_EQ(num_levels - 1, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, Level0TriggerDynamic2) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 200;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200");
+ Add(0, 2U, "200", "250");
+ Add(num_levels - 1, 3U, "200", "250", 300U);
+
+ UpdateVersionStorageInfo();
+ ASSERT_EQ(vstorage_->base_level(), num_levels - 2);
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(1, static_cast<int>(compaction->num_input_levels()));
+ ASSERT_EQ(num_levels - 2, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, Level0TriggerDynamic3) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 200;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200");
+ Add(0, 2U, "200", "250");
+ Add(num_levels - 1, 3U, "200", "250", 300U);
+ Add(num_levels - 1, 4U, "300", "350", 3000U);
+
+ UpdateVersionStorageInfo();
+ ASSERT_EQ(vstorage_->base_level(), num_levels - 3);
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(1, static_cast<int>(compaction->num_input_levels()));
+ ASSERT_EQ(num_levels - 3, compaction->output_level());
+}
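+
+// Illustrative sketch (not part of the upstream test): with
+// level_compaction_dynamic_level_bytes, the base level is roughly the highest
+// level whose target size (the last level's size divided repeatedly by
+// max_bytes_for_level_multiplier) does not exceed max_bytes_for_level_base.
+// Here the last level holds 300 + 3000 = 3300 bytes; 3300 / 10 = 330 > 200
+// but 330 / 10 = 33 <= 200, so two levels are needed above the last one and
+// base_level becomes num_levels - 3.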
+
+TEST_F(CompactionPickerTest, Level0TriggerDynamic4) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 200;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200");
+ Add(0, 2U, "200", "250");
+ Add(num_levels - 1, 3U, "200", "250", 300U);
+ Add(num_levels - 1, 4U, "300", "350", 3000U);
+ Add(num_levels - 3, 5U, "150", "180", 3U);
+ Add(num_levels - 3, 6U, "181", "300", 3U);
+ Add(num_levels - 3, 7U, "400", "450", 3U);
+
+ UpdateVersionStorageInfo();
+ ASSERT_EQ(vstorage_->base_level(), num_levels - 3);
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->num_input_files(1));
+ ASSERT_EQ(num_levels - 3, compaction->level(1));
+ ASSERT_EQ(5U, compaction->input(1, 0)->fd.GetNumber());
+ ASSERT_EQ(6U, compaction->input(1, 1)->fd.GetNumber());
+ ASSERT_EQ(2, static_cast<int>(compaction->num_input_levels()));
+ ASSERT_EQ(num_levels - 3, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, LevelTriggerDynamic4) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 200;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200");
+ Add(num_levels - 1, 2U, "200", "250", 300U);
+ Add(num_levels - 1, 3U, "300", "350", 3000U);
+ Add(num_levels - 1, 4U, "400", "450", 3U);
+ Add(num_levels - 2, 5U, "150", "180", 300U);
+ Add(num_levels - 2, 6U, "181", "350", 500U);
+ Add(num_levels - 2, 7U, "400", "450", 200U);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(5U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(0, compaction->num_input_files(1));
+ ASSERT_EQ(1U, compaction->num_input_levels());
+ ASSERT_EQ(num_levels - 1, compaction->output_level());
+}
+
+// Universal and FIFO Compactions are not supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
+TEST_F(CompactionPickerTest, NeedsCompactionUniversal) {
+ NewVersionStorage(1, kCompactionStyleUniversal);
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+ UpdateVersionStorageInfo();
+  // must return false when there are no files.
+ ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()),
+ false);
+
+  // verify the trigger given different numbers of L0 files.
+ for (int i = 1;
+ i <= mutable_cf_options_.level0_file_num_compaction_trigger * 2; ++i) {
+ NewVersionStorage(1, kCompactionStyleUniversal);
+ Add(0, i, std::to_string((i + 100) * 1000).c_str(),
+ std::to_string((i + 100) * 1000 + 999).c_str(), 1000000, 0, i * 100,
+ i * 100 + 99);
+ UpdateVersionStorageInfo();
+ ASSERT_EQ(level_compaction_picker.NeedsCompaction(vstorage_.get()),
+ vstorage_->CompactionScore(0) >= 1);
+ }
+}
+
+TEST_F(CompactionPickerTest, CompactionUniversalIngestBehindReservedLevel) {
+ const uint64_t kFileSize = 100000;
+ NewVersionStorage(1, kCompactionStyleUniversal);
+ ioptions_.allow_ingest_behind = true;
+ ioptions_.num_levels = 3;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+ UpdateVersionStorageInfo();
+  // must return false when there are no files.
+ ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()),
+ false);
+
+ NewVersionStorage(3, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(0, 2U, "201", "250", kFileSize, 0, 401, 450);
+ Add(0, 4U, "260", "300", kFileSize, 0, 260, 300);
+ Add(1, 5U, "100", "151", kFileSize, 0, 200, 251);
+ Add(1, 3U, "301", "350", kFileSize, 0, 101, 150);
+ Add(2, 6U, "120", "200", kFileSize, 0, 20, 100);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+
+ // output level should be the one above the bottom-most
+ ASSERT_EQ(1, compaction->output_level());
+}
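+
+// Illustrative note (not part of the upstream test): with allow_ingest_behind
+// the bottommost level is reserved for ingested files, so with num_levels = 3
+// the compaction above targets level num_levels - 2 = 1 instead of the last
+// level.
+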
+// Tests whether files can be trivially moved in multi-level universal
+// compaction when the allow_trivial_move option is set. In this test, as the
+// input files overlap, they cannot be trivially moved.
+
+TEST_F(CompactionPickerTest, CannotTrivialMoveUniversal) {
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.compaction_options_universal.allow_trivial_move = true;
+ NewVersionStorage(1, kCompactionStyleUniversal);
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+ UpdateVersionStorageInfo();
+  // must return false when there are no files.
+ ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()),
+ false);
+
+ NewVersionStorage(3, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(0, 2U, "201", "250", kFileSize, 0, 401, 450);
+ Add(0, 4U, "260", "300", kFileSize, 0, 260, 300);
+ Add(1, 5U, "100", "151", kFileSize, 0, 200, 251);
+ Add(1, 3U, "301", "350", kFileSize, 0, 101, 150);
+ Add(2, 6U, "120", "200", kFileSize, 0, 20, 100);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+
+ ASSERT_TRUE(!compaction->is_trivial_move());
+}
+// Tests whether files can be trivially moved in multi-level universal
+// compaction when the allow_trivial_move option is set. In this test, as the
+// input files do not overlap, they should be trivially moved.
+TEST_F(CompactionPickerTest, AllowsTrivialMoveUniversal) {
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.compaction_options_universal.allow_trivial_move = true;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(3, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(0, 2U, "201", "250", kFileSize, 0, 401, 450);
+ Add(0, 4U, "260", "300", kFileSize, 0, 260, 300);
+ Add(1, 5U, "010", "080", kFileSize, 0, 200, 251);
+ Add(2, 3U, "301", "350", kFileSize, 0, 101, 150);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+
+ ASSERT_TRUE(compaction->is_trivial_move());
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction1) {
+ // The case where universal periodic compaction can be picked
+ // with some newer files being compacted.
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.periodic_compaction_seconds = 1000;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(0, 2U, "201", "250", kFileSize, 0, 401, 450);
+ Add(0, 4U, "260", "300", kFileSize, 0, 260, 300);
+ Add(3, 5U, "010", "080", kFileSize, 0, 200, 251);
+ Add(4, 3U, "301", "350", kFileSize, 0, 101, 150);
+ Add(4, 6U, "501", "750", kFileSize, 0, 101, 150);
+
+ file_map_[2].first->being_compacted = true;
+ UpdateVersionStorageInfo();
+ vstorage_->TEST_AddFileMarkedForPeriodicCompaction(4, file_map_[3].first);
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+
+ ASSERT_TRUE(compaction);
+ ASSERT_EQ(4, compaction->output_level());
+ ASSERT_EQ(0, compaction->start_level());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction2) {
+  // The case where universal periodic compaction does not pick the only
+  // level to compact if it doesn't cover any file marked for periodic
+  // compaction.
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.periodic_compaction_seconds = 1000;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(3, 5U, "010", "080", kFileSize, 0, 200, 251);
+ Add(4, 3U, "301", "350", kFileSize, 0, 101, 150);
+ Add(4, 6U, "501", "750", kFileSize, 0, 101, 150);
+
+ file_map_[5].first->being_compacted = true;
+ UpdateVersionStorageInfo();
+ vstorage_->TEST_AddFileMarkedForPeriodicCompaction(0, file_map_[1].first);
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+
+ ASSERT_FALSE(compaction);
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction3) {
+  // The case where universal periodic compaction does not pick up the last
+  // sorted run, which is an L0 file, if it isn't marked for periodic
+  // compaction.
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.periodic_compaction_seconds = 1000;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(0, 5U, "010", "080", kFileSize, 0, 200, 251);
+ Add(0, 6U, "501", "750", kFileSize, 0, 101, 150);
+
+ file_map_[5].first->being_compacted = true;
+ UpdateVersionStorageInfo();
+ vstorage_->TEST_AddFileMarkedForPeriodicCompaction(0, file_map_[1].first);
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+
+ ASSERT_FALSE(compaction);
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction4) {
+  // The case where universal periodic compaction cannot form a compaction
+  // that includes any file marked for periodic compaction. Right now we form
+  // the compaction anyway if there is more than one sorted run; this case is
+  // here just to validate that it doesn't crash.
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.periodic_compaction_seconds = 1000;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(2, 2U, "010", "080", kFileSize, 0, 200, 251);
+ Add(3, 5U, "010", "080", kFileSize, 0, 200, 251);
+ Add(4, 3U, "301", "350", kFileSize, 0, 101, 150);
+ Add(4, 6U, "501", "750", kFileSize, 0, 101, 150);
+
+ file_map_[2].first->being_compacted = true;
+ UpdateVersionStorageInfo();
+ vstorage_->TEST_AddFileMarkedForPeriodicCompaction(0, file_map_[2].first);
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(!compaction ||
+ compaction->start_level() != compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction5) {
+ // Test single L0 file periodic compaction triggering.
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.periodic_compaction_seconds = 1000;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(0, 6U, "150", "200", kFileSize, 0, 500, 550);
+ UpdateVersionStorageInfo();
+ vstorage_->TEST_AddFileMarkedForPeriodicCompaction(0, file_map_[6].first);
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction);
+ ASSERT_EQ(0, compaction->start_level());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(6U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(4, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction6) {
+ // Test single sorted run non-L0 periodic compaction
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.periodic_compaction_seconds = 1000;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(4, 5U, "150", "200", kFileSize, 0, 500, 550);
+ Add(4, 6U, "350", "400", kFileSize, 0, 500, 550);
+ UpdateVersionStorageInfo();
+ vstorage_->TEST_AddFileMarkedForPeriodicCompaction(4, file_map_[6].first);
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction);
+ ASSERT_EQ(4, compaction->start_level());
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(5U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(6U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(4, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, UniversalIncrementalSpace1) {
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.max_compaction_bytes = 555555;
+ mutable_cf_options_.compaction_options_universal.incremental = true;
+ mutable_cf_options_.compaction_options_universal
+ .max_size_amplification_percent = 30;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(2, 2U, "010", "080", kFileSize, 0, 200, 251);
+ Add(3, 5U, "310", "380", kFileSize, 0, 200, 251);
+ Add(3, 6U, "410", "880", kFileSize, 0, 200, 251);
+ Add(3, 7U, "910", "980", 1, 0, 200, 251);
+ Add(4, 10U, "201", "250", kFileSize, 0, 101, 150);
+ Add(4, 11U, "301", "350", kFileSize, 0, 101, 150);
+ Add(4, 12U, "401", "450", kFileSize, 0, 101, 150);
+ Add(4, 13U, "501", "750", kFileSize, 0, 101, 150);
+ Add(4, 14U, "801", "850", kFileSize, 0, 101, 150);
+ Add(4, 15U, "901", "950", kFileSize, 0, 101, 150);
+ // Add(4, 15U, "960", "970", kFileSize, 0, 101, 150);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction);
+ ASSERT_EQ(4, compaction->output_level());
+ ASSERT_EQ(3, compaction->start_level());
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(5U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(6U, compaction->input(0, 1)->fd.GetNumber());
+ // ASSERT_EQ(4U, compaction->num_input_files(1));
+ ASSERT_EQ(11U, compaction->input(1, 0)->fd.GetNumber());
+ ASSERT_EQ(12U, compaction->input(1, 1)->fd.GetNumber());
+ ASSERT_EQ(13U, compaction->input(1, 2)->fd.GetNumber());
+ ASSERT_EQ(14U, compaction->input(1, 3)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, UniversalIncrementalSpace2) {
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.max_compaction_bytes = 400000;
+ mutable_cf_options_.compaction_options_universal.incremental = true;
+ mutable_cf_options_.compaction_options_universal
+ .max_size_amplification_percent = 30;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(1, 2U, "010", "080", kFileSize, 0, 200, 251);
+ Add(2, 5U, "310", "380", kFileSize, 0, 200, 251);
+ Add(2, 6U, "410", "880", kFileSize, 0, 200, 251);
+ Add(2, 7U, "910", "980", kFileSize, 0, 200, 251);
+ Add(4, 10U, "201", "250", kFileSize, 0, 101, 150);
+ Add(4, 11U, "301", "350", kFileSize, 0, 101, 150);
+ Add(4, 12U, "401", "450", kFileSize, 0, 101, 150);
+ Add(4, 13U, "501", "750", kFileSize, 0, 101, 150);
+ Add(4, 14U, "801", "850", kFileSize, 0, 101, 150);
+ Add(4, 15U, "901", "950", kFileSize, 0, 101, 150);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction);
+ ASSERT_EQ(4, compaction->output_level());
+ ASSERT_EQ(2, compaction->start_level());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(1U, compaction->num_input_files(1));
+ ASSERT_EQ(15U, compaction->input(1, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, UniversalIncrementalSpace3) {
+  // Test bottom-level files falling in the gaps between two upper-level
+  // files
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.max_compaction_bytes = 300000;
+ mutable_cf_options_.compaction_options_universal.incremental = true;
+ mutable_cf_options_.compaction_options_universal
+ .max_size_amplification_percent = 30;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(2, 2U, "010", "080", kFileSize, 0, 200, 251);
+ Add(3, 5U, "000", "180", kFileSize, 0, 200, 251);
+ Add(3, 6U, "181", "190", kFileSize, 0, 200, 251);
+ Add(3, 7U, "710", "810", kFileSize, 0, 200, 251);
+ Add(3, 8U, "820", "830", kFileSize, 0, 200, 251);
+ Add(3, 9U, "900", "991", kFileSize, 0, 200, 251);
+ Add(4, 10U, "201", "250", kFileSize, 0, 101, 150);
+ Add(4, 11U, "301", "350", kFileSize, 0, 101, 150);
+ Add(4, 12U, "401", "450", kFileSize, 0, 101, 150);
+ Add(4, 13U, "501", "750", kFileSize, 0, 101, 150);
+ Add(4, 14U, "801", "850", kFileSize, 0, 101, 150);
+ Add(4, 15U, "901", "950", kFileSize, 0, 101, 150);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction);
+ ASSERT_EQ(4, compaction->output_level());
+ ASSERT_EQ(2, compaction->start_level());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->num_input_files(1));
+ ASSERT_EQ(5U, compaction->input(1, 0)->fd.GetNumber());
+ ASSERT_EQ(6U, compaction->input(1, 1)->fd.GetNumber());
+ ASSERT_EQ(0, compaction->num_input_files(2));
+}
+
+TEST_F(CompactionPickerTest, UniversalIncrementalSpace4) {
+ // Test compaction candidates always cover many files.
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.max_compaction_bytes = 3200000;
+ mutable_cf_options_.compaction_options_universal.incremental = true;
+ mutable_cf_options_.compaction_options_universal
+ .max_size_amplification_percent = 30;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(2, 2U, "010", "080", kFileSize, 0, 200, 251);
+
+  // Generate files like the following:
+ // L3: (1101, 1180) (1201, 1280) ... (7901, 7908)
+ // L4: (1130, 1150) (1160, 1210) (1230, 1250) (1260 1310) ... (7960, 8010)
+ for (int i = 11; i < 79; i++) {
+ Add(3, 100 + i * 3, std::to_string(i * 100).c_str(),
+ std::to_string(i * 100 + 80).c_str(), kFileSize, 0, 200, 251);
+ // Add a tie breaker
+ if (i == 66) {
+ Add(3, 10000U, "6690", "6699", kFileSize, 0, 200, 251);
+ }
+
+ Add(4, 100 + i * 3 + 1, std::to_string(i * 100 + 30).c_str(),
+ std::to_string(i * 100 + 50).c_str(), kFileSize, 0, 200, 251);
+ Add(4, 100 + i * 3 + 2, std::to_string(i * 100 + 60).c_str(),
+ std::to_string(i * 100 + 110).c_str(), kFileSize, 0, 200, 251);
+ }
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction);
+ ASSERT_EQ(4, compaction->output_level());
+ ASSERT_EQ(3, compaction->start_level());
+ ASSERT_EQ(6U, compaction->num_input_files(0));
+ ASSERT_EQ(100 + 62U * 3, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(10000U, compaction->input(0, 5)->fd.GetNumber());
+ ASSERT_EQ(11, compaction->num_input_files(1));
+}
+
+TEST_F(CompactionPickerTest, UniversalIncrementalSpace5) {
+  // Test compaction candidates always covering many files, with some
+  // individual files larger than the size threshold.
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.max_compaction_bytes = 3200000;
+ mutable_cf_options_.compaction_options_universal.incremental = true;
+ mutable_cf_options_.compaction_options_universal
+ .max_size_amplification_percent = 30;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(2, 2U, "010", "080", kFileSize, 0, 200, 251);
+
+  // Generate files like the following:
+ // L3: (1101, 1180) (1201, 1280) ... (7901, 7908)
+ // L4: (1130, 1150) (1160, 1210) (1230, 1250) (1260 1310) ... (7960, 8010)
+ for (int i = 11; i < 70; i++) {
+ Add(3, 100 + i * 3, std::to_string(i * 100).c_str(),
+ std::to_string(i * 100 + 80).c_str(),
+ i % 10 == 9 ? kFileSize * 100 : kFileSize, 0, 200, 251);
+
+ Add(4, 100 + i * 3 + 1, std::to_string(i * 100 + 30).c_str(),
+ std::to_string(i * 100 + 50).c_str(), kFileSize, 0, 200, 251);
+ Add(4, 100 + i * 3 + 2, std::to_string(i * 100 + 60).c_str(),
+ std::to_string(i * 100 + 110).c_str(), kFileSize, 0, 200, 251);
+ }
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction);
+ ASSERT_EQ(4, compaction->output_level());
+ ASSERT_EQ(3, compaction->start_level());
+ ASSERT_EQ(6U, compaction->num_input_files(0));
+ ASSERT_EQ(100 + 14 * 3, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(100 + 19 * 3, compaction->input(0, 5)->fd.GetNumber());
+ ASSERT_EQ(13, compaction->num_input_files(1));
+}
+
+TEST_F(CompactionPickerTest, NeedsCompactionFIFO) {
+ NewVersionStorage(1, kCompactionStyleFIFO);
+ const int kFileCount =
+ mutable_cf_options_.level0_file_num_compaction_trigger * 3;
+ const uint64_t kFileSize = 100000;
+ const uint64_t kMaxSize = kFileSize * kFileCount / 2;
+
+ fifo_options_.max_table_files_size = kMaxSize;
+ mutable_cf_options_.compaction_options_fifo = fifo_options_;
+ FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+ UpdateVersionStorageInfo();
+  // must return false when there are no files.
+ ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), false);
+
+ // verify whether compaction is needed based on the current
+ // size of L0 files.
+ for (int i = 1; i <= kFileCount; ++i) {
+ NewVersionStorage(1, kCompactionStyleFIFO);
+ Add(0, i, std::to_string((i + 100) * 1000).c_str(),
+ std::to_string((i + 100) * 1000 + 999).c_str(), kFileSize, 0, i * 100,
+ i * 100 + 99);
+ UpdateVersionStorageInfo();
+ ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()),
+ vstorage_->CompactionScore(0) >= 1);
+ }
+}
+
+TEST_F(CompactionPickerTest, FIFOToWarm1) {
+ NewVersionStorage(1, kCompactionStyleFIFO);
+ const uint64_t kFileSize = 100000;
+ const uint64_t kMaxSize = kFileSize * 100000;
+ uint64_t kWarmThreshold = 2000;
+
+ fifo_options_.max_table_files_size = kMaxSize;
+ fifo_options_.age_for_warm = kWarmThreshold;
+ mutable_cf_options_.compaction_options_fifo = fifo_options_;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_compaction_bytes = kFileSize * 100;
+ FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+
+ int64_t current_time = 0;
+ ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
+ uint64_t threshold_time =
+ static_cast<uint64_t>(current_time) - kWarmThreshold;
+ Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
+ Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
+ Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
+ Temperature::kUnknown, threshold_time + 100);
+ Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
+ Temperature::kUnknown, threshold_time - 2000);
+ Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
+ Temperature::kUnknown, threshold_time - 3000);
+ UpdateVersionStorageInfo();
+
+ ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true);
+ std::unique_ptr<Compaction> compaction(fifo_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber());
+}
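+
+// Illustrative sketch (not part of the upstream test): a file is a candidate
+// for the FIFO-to-warm migration only if its oldest ancestor time is older
+// than the warm threshold, i.e. roughly
+//   oldest_ancestor_time < now - fifo_options_.age_for_warm (= threshold_time)
+// Files 3 and 4 above satisfy this (threshold_time - 3000 and - 2000), while
+// files 5 and 6 are too recent; of these the test expects the oldest,
+// file 3, to be picked.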
+
+TEST_F(CompactionPickerTest, FIFOToWarm2) {
+ NewVersionStorage(1, kCompactionStyleFIFO);
+ const uint64_t kFileSize = 100000;
+ const uint64_t kMaxSize = kFileSize * 100000;
+ uint64_t kWarmThreshold = 2000;
+
+ fifo_options_.max_table_files_size = kMaxSize;
+ fifo_options_.age_for_warm = kWarmThreshold;
+ mutable_cf_options_.compaction_options_fifo = fifo_options_;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_compaction_bytes = kFileSize * 100;
+ FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+
+ int64_t current_time = 0;
+ ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
+ uint64_t threshold_time =
+ static_cast<uint64_t>(current_time) - kWarmThreshold;
+ Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
+ Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
+ Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
+ Temperature::kUnknown, threshold_time + 100);
+ Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
+ Temperature::kUnknown, threshold_time - 2000);
+ Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
+ Temperature::kUnknown, threshold_time - 3000);
+ Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true,
+ Temperature::kUnknown, threshold_time - 4000);
+ UpdateVersionStorageInfo();
+
+ ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true);
+ std::unique_ptr<Compaction> compaction(fifo_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(3U, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, FIFOToWarmMaxSize) {
+ NewVersionStorage(1, kCompactionStyleFIFO);
+ const uint64_t kFileSize = 100000;
+ const uint64_t kMaxSize = kFileSize * 100000;
+ uint64_t kWarmThreshold = 2000;
+
+ fifo_options_.max_table_files_size = kMaxSize;
+ fifo_options_.age_for_warm = kWarmThreshold;
+ mutable_cf_options_.compaction_options_fifo = fifo_options_;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_compaction_bytes = kFileSize * 9;
+ FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+
+ int64_t current_time = 0;
+ ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
+ uint64_t threshold_time =
+ static_cast<uint64_t>(current_time) - kWarmThreshold;
+ Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
+ Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
+ Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
+ Temperature::kUnknown, threshold_time + 100);
+ Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
+ Temperature::kUnknown, threshold_time - 2000);
+ Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
+ Temperature::kUnknown, threshold_time - 3000);
+ Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true,
+ Temperature::kUnknown, threshold_time - 4000);
+ Add(0, 1U, "200", "300", 4 * kFileSize, 0, 2000, 2100, 0, true,
+ Temperature::kUnknown, threshold_time - 5000);
+ UpdateVersionStorageInfo();
+
+ ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true);
+ std::unique_ptr<Compaction> compaction(fifo_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, FIFOToWarmWithExistingWarm) {
+ NewVersionStorage(1, kCompactionStyleFIFO);
+ const uint64_t kFileSize = 100000;
+ const uint64_t kMaxSize = kFileSize * 100000;
+ uint64_t kWarmThreshold = 2000;
+
+ fifo_options_.max_table_files_size = kMaxSize;
+ fifo_options_.age_for_warm = kWarmThreshold;
+ mutable_cf_options_.compaction_options_fifo = fifo_options_;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_compaction_bytes = kFileSize * 100;
+ FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+
+ int64_t current_time = 0;
+ ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
+ uint64_t threshold_time =
+ static_cast<uint64_t>(current_time) - kWarmThreshold;
+ Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
+ Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
+ Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
+ Temperature::kUnknown, threshold_time + 100);
+ Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
+ Temperature::kUnknown, threshold_time - 2000);
+ Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
+ Temperature::kUnknown, threshold_time - 3000);
+ Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true,
+ Temperature::kUnknown, threshold_time - 4000);
+ Add(0, 1U, "200", "300", 4 * kFileSize, 0, 2000, 2100, 0, true,
+ Temperature::kWarm, threshold_time - 5000);
+ UpdateVersionStorageInfo();
+
+ ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true);
+ std::unique_ptr<Compaction> compaction(fifo_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(3U, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, FIFOToWarmWithOngoing) {
+ NewVersionStorage(1, kCompactionStyleFIFO);
+ const uint64_t kFileSize = 100000;
+ const uint64_t kMaxSize = kFileSize * 100000;
+ uint64_t kWarmThreshold = 2000;
+
+ fifo_options_.max_table_files_size = kMaxSize;
+ fifo_options_.age_for_warm = kWarmThreshold;
+ mutable_cf_options_.compaction_options_fifo = fifo_options_;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_compaction_bytes = kFileSize * 100;
+ FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+
+ int64_t current_time = 0;
+ ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
+ uint64_t threshold_time =
+ static_cast<uint64_t>(current_time) - kWarmThreshold;
+ Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
+ Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
+ Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
+ Temperature::kUnknown, threshold_time + 100);
+ Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
+ Temperature::kUnknown, threshold_time - 2000);
+ Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
+ Temperature::kUnknown, threshold_time - 3000);
+ Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true,
+ Temperature::kUnknown, threshold_time - 4000);
+ Add(0, 1U, "200", "300", 4 * kFileSize, 0, 2000, 2100, 0, true,
+ Temperature::kWarm, threshold_time - 5000);
+ file_map_[2].first->being_compacted = true;
+ UpdateVersionStorageInfo();
+
+ ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true);
+ std::unique_ptr<Compaction> compaction(fifo_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ // Stop if a file is being compacted
+ ASSERT_TRUE(compaction.get() == nullptr);
+}
+
+TEST_F(CompactionPickerTest, FIFOToWarmWithHotBetweenWarms) {
+ NewVersionStorage(1, kCompactionStyleFIFO);
+ const uint64_t kFileSize = 100000;
+ const uint64_t kMaxSize = kFileSize * 100000;
+ uint64_t kWarmThreshold = 2000;
+
+ fifo_options_.max_table_files_size = kMaxSize;
+ fifo_options_.age_for_warm = kWarmThreshold;
+ mutable_cf_options_.compaction_options_fifo = fifo_options_;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_compaction_bytes = kFileSize * 100;
+ FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+
+ int64_t current_time = 0;
+ ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
+ uint64_t threshold_time =
+ static_cast<uint64_t>(current_time) - kWarmThreshold;
+ Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
+ Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
+ Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
+ Temperature::kUnknown, threshold_time + 100);
+ Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
+ Temperature::kUnknown, threshold_time - 2000);
+ Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
+ Temperature::kWarm, threshold_time - 3000);
+ Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true,
+ Temperature::kUnknown, threshold_time - 4000);
+ Add(0, 1U, "200", "300", 4 * kFileSize, 0, 2000, 2100, 0, true,
+ Temperature::kWarm, threshold_time - 5000);
+ UpdateVersionStorageInfo();
+
+ ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true);
+ std::unique_ptr<Compaction> compaction(fifo_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+  // The non-warm file sitting between the two existing warm files is picked
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+}
+
+#endif // ROCKSDB_LITE
+
+TEST_F(CompactionPickerTest, CompactionPriMinOverlapping1) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ mutable_cf_options_.target_file_size_base = 100000000000;
+ mutable_cf_options_.target_file_size_multiplier = 10;
+ mutable_cf_options_.max_bytes_for_level_base = 10 * 1024 * 1024;
+ mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+
+ Add(2, 6U, "150", "179", 50000000U);
+ Add(2, 7U, "180", "220", 50000000U);
+ Add(2, 8U, "321", "400", 50000000U); // File not overlapping
+ Add(2, 9U, "721", "800", 50000000U);
+
+ Add(3, 26U, "150", "170", 260000000U);
+ Add(3, 27U, "171", "179", 260000000U);
+ Add(3, 28U, "191", "220", 260000000U);
+ Add(3, 29U, "221", "300", 260000000U);
+ Add(3, 30U, "750", "900", 260000000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ // Pick file 8 because it overlaps with 0 files on level 3.
+ ASSERT_EQ(8U, compaction->input(0, 0)->fd.GetNumber());
+ // Compaction input size * 1.1
+ ASSERT_GE(uint64_t{55000000}, compaction->OutputFilePreallocationSize());
+}
+
+TEST_F(CompactionPickerTest, CompactionPriMinOverlapping2) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ mutable_cf_options_.target_file_size_base = 10000000;
+ mutable_cf_options_.target_file_size_multiplier = 10;
+ mutable_cf_options_.max_bytes_for_level_base = 10 * 1024 * 1024;
+
+ Add(2, 6U, "150", "175",
+ 60000000U); // Overlaps with file 26, 27, total size 521M
+ Add(2, 7U, "176", "200", 60000000U); // Overlaps with file 27, 28, total size
+ // 520M, the smallest overlapping
+ Add(2, 8U, "201", "300",
+ 60000000U); // Overlaps with file 28, 29, total size 521M
+
+ Add(3, 25U, "100", "110", 261000000U);
+ Add(3, 26U, "150", "170", 261000000U);
+ Add(3, 27U, "171", "179", 260000000U);
+ Add(3, 28U, "191", "220", 260000000U);
+ Add(3, 29U, "221", "300", 261000000U);
+ Add(3, 30U, "321", "400", 261000000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+  // Picking file 7 because its overlapping ratio is the smallest.
+ ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber());
+}
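+
+// Illustrative sketch (not part of the upstream test): under
+// kMinOverlappingRatio each candidate is scored roughly by
+//   bytes overlapped in the output level / candidate file size
+// and the smallest ratio wins. For the files above:
+//   file 6: ~521MB / 60MB, file 7: ~520MB / 60MB, file 8: ~521MB / 60MB,
+// so file 7, with the least overlap, is expected to be picked.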
+
+TEST_F(CompactionPickerTest, CompactionPriMinOverlapping3) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ mutable_cf_options_.max_bytes_for_level_base = 10000000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+
+  // Files 7 and 8 each overlap a single file of the same size in level 3,
+  // but file 8 is larger, so its overlapping ratio is smaller and it will be
+  // picked.
+ Add(2, 6U, "150", "167", 60000000U); // Overlaps with file 26, 27
+ Add(2, 7U, "168", "169", 60000000U); // Overlaps with file 27
+ Add(2, 8U, "201", "300", 61000000U); // Overlaps with file 28, but the file
+ // itself is larger. Should be picked.
+
+ Add(3, 26U, "160", "165", 260000000U);
+ Add(3, 27U, "166", "170", 260000000U);
+ Add(3, 28U, "180", "400", 260000000U);
+ Add(3, 29U, "401", "500", 260000000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+  // Picking file 8 because its overlapping ratio is the smallest.
+ ASSERT_EQ(8U, compaction->input(0, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, CompactionPriMinOverlapping4) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ mutable_cf_options_.max_bytes_for_level_base = 10000000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ mutable_cf_options_.ignore_max_compaction_bytes_for_input = false;
+
+  // File 6 overlaps with files 26 and 27. Its compensated file size is much
+  // larger than its raw size, which lowers its overlapping ratio, so it will
+  // be picked.
+ Add(2, 6U, "150", "167", 60000000U, 0, 100, 100, 180000000U);
+ Add(2, 7U, "168", "169", 60000000U); // Overlaps with file 27
+ Add(2, 8U, "201", "300", 61000000U); // Overlaps with file 28
+
+ Add(3, 26U, "160", "165", 60000000U);
+ // Boosted file size in output level is not considered.
+ Add(3, 27U, "166", "170", 60000000U, 0, 100, 100, 260000000U);
+ Add(3, 28U, "180", "400", 60000000U);
+ Add(3, 29U, "401", "500", 60000000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+  // Picking file 6 because its compensated size gives it the smallest
+  // overlapping ratio.
+ ASSERT_EQ(6U, compaction->input(0, 0)->fd.GetNumber());
+}
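+
+// Illustrative sketch (not part of the upstream test): assuming the ratio
+// uses the candidate's compensated size in the denominator while the
+// output-level overlap uses raw sizes, the files above score roughly
+//   file 6: 120MB / 180MB ~= 0.67, file 7: 60MB / 60MB = 1.0,
+//   file 8: 60MB / 61MB ~= 0.98,
+// so the compensation is what makes file 6 the pick.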
+
+TEST_F(CompactionPickerTest, CompactionPriRoundRobin) {
+ std::vector<InternalKey> test_cursors = {InternalKey("249", 100, kTypeValue),
+ InternalKey("600", 100, kTypeValue),
+ InternalKey()};
+ std::vector<uint32_t> selected_files = {8U, 6U, 6U};
+
+ ioptions_.compaction_pri = kRoundRobin;
+ mutable_cf_options_.max_bytes_for_level_base = 12000000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ for (size_t i = 0; i < test_cursors.size(); i++) {
+ // start a brand new version in each test.
+ NewVersionStorage(6, kCompactionStyleLevel);
+ vstorage_->ResizeCompactCursors(6);
+ // Set the cursor
+ vstorage_->AddCursorForOneLevel(2, test_cursors[i]);
+ Add(2, 6U, "150", "199", 50000000U); // Overlap with 26U, 27U
+ Add(2, 7U, "200", "249", 50000000U); // File not overlapping
+ Add(2, 8U, "300", "600", 50000000U); // Overlap with 28U, 29U
+
+ Add(3, 26U, "130", "165", 60000000U);
+ Add(3, 27U, "166", "170", 60000000U);
+ Add(3, 28U, "270", "340", 60000000U);
+ Add(3, 29U, "401", "500", 60000000U);
+ UpdateVersionStorageInfo();
+ LevelCompactionPicker local_level_compaction_picker =
+ LevelCompactionPicker(ioptions_, &icmp_);
+ std::unique_ptr<Compaction> compaction(
+ local_level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+    // Since the max bytes for level 2 is 120M and picking one file to
+    // compact brings the post-compaction level size below 120M, exactly one
+    // file is picked for round-robin compaction.
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(selected_files[i], compaction->input(0, 0)->fd.GetNumber());
+ // release the version storage
+ DeleteVersionStorage();
+ }
+}
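+
+// Illustrative sketch (not part of the upstream test): with kRoundRobin the
+// per-level compact cursor decides where picking resumes, roughly the first
+// file whose key range lies past the cursor key, wrapping to the beginning
+// when the cursor is at or past the last file. That matches the expectations
+// above: cursor "249" -> file 8 ("300".."600"), cursor "600" -> wrap to
+// file 6, empty cursor -> start from file 6.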
+
+TEST_F(CompactionPickerTest, CompactionPriMultipleFilesRoundRobin1) {
+ ioptions_.compaction_pri = kRoundRobin;
+ mutable_cf_options_.max_compaction_bytes = 100000000u;
+ mutable_cf_options_.max_bytes_for_level_base = 120;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ // start a brand new version in each test.
+ NewVersionStorage(6, kCompactionStyleLevel);
+ vstorage_->ResizeCompactCursors(6);
+ // Set the cursor (file picking should start with 7U)
+ vstorage_->AddCursorForOneLevel(2, InternalKey("199", 100, kTypeValue));
+ Add(2, 6U, "150", "199", 500U);
+ Add(2, 7U, "200", "249", 500U);
+ Add(2, 8U, "300", "600", 500U);
+ Add(2, 9U, "700", "800", 500U);
+ Add(2, 10U, "850", "950", 500U);
+
+ Add(3, 26U, "130", "165", 600U);
+ Add(3, 27U, "166", "170", 600U);
+ Add(3, 28U, "270", "340", 600U);
+ Add(3, 29U, "401", "500", 600U);
+ Add(3, 30U, "601", "800", 600U);
+ Add(3, 31U, "830", "890", 600U);
+ UpdateVersionStorageInfo();
+ LevelCompactionPicker local_level_compaction_picker =
+ LevelCompactionPicker(ioptions_, &icmp_);
+ std::unique_ptr<Compaction> compaction(
+ local_level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+
+  // The maximum compaction bytes is very large in this case, so we can
+  // ignore its constraint in this test. The maximum bytes for level 2 is
+  // 1200 bytes, and thus at least 3 files should be picked so that the bytes
+  // left in level 2 are less than the maximum.
+ ASSERT_EQ(3U, compaction->num_input_files(0));
+ ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(8U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(9U, compaction->input(0, 2)->fd.GetNumber());
+ // release the version storage
+ DeleteVersionStorage();
+}
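+
+// Illustrative arithmetic (not part of the upstream test): level 2 holds
+// 5 * 500 = 2500 bytes against a target of 1200 bytes, so at least three
+// 500-byte files must be compacted away (2500 - 3 * 500 = 1000 <= 1200),
+// which is why files 7U, 8U and 9U are picked starting from the cursor.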
+
+TEST_F(CompactionPickerTest, CompactionPriMultipleFilesRoundRobin2) {
+ ioptions_.compaction_pri = kRoundRobin;
+ mutable_cf_options_.max_compaction_bytes = 2500u;
+ mutable_cf_options_.max_bytes_for_level_base = 120;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ // start a brand new version in each test.
+ NewVersionStorage(6, kCompactionStyleLevel);
+ vstorage_->ResizeCompactCursors(6);
+ // Set the cursor (file picking should start with 6U)
+ vstorage_->AddCursorForOneLevel(2, InternalKey("1000", 100, kTypeValue));
+ Add(2, 6U, "150", "199", 500U); // Overlap with 26U, 27U
+ Add(2, 7U, "200", "249", 500U); // Overlap with 27U
+ Add(2, 8U, "300", "600", 500U); // Overlap with 28U, 29U
+ Add(2, 9U, "700", "800", 500U);
+ Add(2, 10U, "850", "950", 500U);
+
+ Add(3, 26U, "130", "165", 600U);
+ Add(3, 27U, "166", "230", 600U);
+ Add(3, 28U, "270", "340", 600U);
+ Add(3, 29U, "401", "500", 600U);
+ Add(3, 30U, "601", "800", 600U);
+ Add(3, 31U, "830", "890", 600U);
+ UpdateVersionStorageInfo();
+ LevelCompactionPicker local_level_compaction_picker =
+ LevelCompactionPicker(ioptions_, &icmp_);
+ std::unique_ptr<Compaction> compaction(
+ local_level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+
+  // The maximum compaction bytes is only 2500 bytes now. Even though we
+  // would need to choose 3 files so that the post-compaction level size is
+  // less than 1200 bytes, we cannot pick 3 files to compact since the
+  // maximum compaction size is 2500. After picking files 6U and 7U, the
+  // number of compaction bytes has reached 2200, leaving no room to add
+  // another 500-byte input file.
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(6U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(7U, compaction->input(0, 1)->fd.GetNumber());
+ // release the version storage
+ DeleteVersionStorage();
+}
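+
+// Illustrative arithmetic (not part of the upstream test): picking 6U and 7U
+// plus their overlapping level-3 files 26U and 27U already accounts for
+// 500 + 500 + 600 + 600 = 2200 compaction bytes, so adding a third 500-byte
+// input file would exceed the 2500-byte max_compaction_bytes limit.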
+
+TEST_F(CompactionPickerTest, CompactionPriMultipleFilesRoundRobin3) {
+ ioptions_.compaction_pri = kRoundRobin;
+ mutable_cf_options_.max_compaction_bytes = 1000000u;
+ mutable_cf_options_.max_bytes_for_level_base = 120;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ // start a brand new version in each test.
+ NewVersionStorage(6, kCompactionStyleLevel);
+ vstorage_->ResizeCompactCursors(6);
+ // Set the cursor (file picking should start with 9U)
+ vstorage_->AddCursorForOneLevel(2, InternalKey("700", 100, kTypeValue));
+ Add(2, 6U, "150", "199", 500U);
+ Add(2, 7U, "200", "249", 500U);
+ Add(2, 8U, "300", "600", 500U);
+ Add(2, 9U, "700", "800", 500U);
+ Add(2, 10U, "850", "950", 500U);
+
+ Add(3, 26U, "130", "165", 600U);
+ Add(3, 27U, "166", "170", 600U);
+ Add(3, 28U, "270", "340", 600U);
+ Add(3, 29U, "401", "500", 600U);
+ Add(3, 30U, "601", "800", 600U);
+ Add(3, 31U, "830", "890", 600U);
+ UpdateVersionStorageInfo();
+ LevelCompactionPicker local_level_compaction_picker =
+ LevelCompactionPicker(ioptions_, &icmp_);
+ std::unique_ptr<Compaction> compaction(
+ local_level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+
+  // Cannot pick more files since we have reached the last file in level 2
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(9U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(10U, compaction->input(0, 1)->fd.GetNumber());
+ // release the version storage
+ DeleteVersionStorage();
+}
+
+TEST_F(CompactionPickerTest, CompactionPriMinOverlappingManyFiles) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ mutable_cf_options_.max_bytes_for_level_base = 15000000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+
+  // Files 7 and 8 each overlap a single 260MB file, but file 8 itself is
+  // larger, so its overlapping ratio is smaller and it will be picked.
+ Add(2, 13U, "010", "011",
+ 6100U); // Overlaps with a large file. Not picked
+ Add(2, 14U, "020", "021",
+ 6100U); // Overlaps with a large file. Not picked
+ Add(2, 15U, "030", "031",
+ 6100U); // Overlaps with a large file. Not picked
+ Add(2, 16U, "040", "041",
+ 6100U); // Overlaps with a large file. Not picked
+ Add(2, 17U, "050", "051",
+ 6100U); // Overlaps with a large file. Not picked
+ Add(2, 18U, "060", "061",
+ 6100U); // Overlaps with a large file. Not picked
+ Add(2, 19U, "070", "071",
+ 6100U); // Overlaps with a large file. Not picked
+ Add(2, 20U, "080", "081",
+ 6100U); // Overlaps with a large file. Not picked
+
+ Add(2, 6U, "150", "167", 60000000U); // Overlaps with file 26, 27
+ Add(2, 7U, "168", "169", 60000000U); // Overlaps with file 27
+ Add(2, 8U, "201", "300", 61000000U); // Overlaps with file 28, but the file
+ // itself is larger. Should be picked.
+ Add(2, 9U, "610", "611",
+ 6100U); // Overlaps with a large file. Not picked
+ Add(2, 10U, "620", "621",
+ 6100U); // Overlaps with a large file. Not picked
+ Add(2, 11U, "630", "631",
+ 6100U); // Overlaps with a large file. Not picked
+ Add(2, 12U, "640", "641",
+ 6100U); // Overlaps with a large file. Not picked
+
+ Add(3, 31U, "001", "100", 260000000U);
+ Add(3, 26U, "160", "165", 260000000U);
+ Add(3, 27U, "166", "170", 260000000U);
+ Add(3, 28U, "180", "400", 260000000U);
+ Add(3, 29U, "401", "500", 260000000U);
+ Add(3, 30U, "601", "700", 260000000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+  // Picking file 8 because its overlapping ratio is the smallest.
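+  // (Roughly: file 8 overlaps ~260MB / 61MB ~= 4.26, file 7 ~260MB / 60MB
+  // ~= 4.33, file 6 ~520MB / 60MB ~= 8.7, and the tiny 6100-byte files have
+  // enormous ratios.)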
+ ASSERT_EQ(8U, compaction->input(0, 0)->fd.GetNumber());
+}
+
+// This test exhibits the bug where we don't properly reset parent_index in
+// PickCompaction()
+TEST_F(CompactionPickerTest, ParentIndexResetBug) {
+ int num_levels = ioptions_.num_levels;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 200;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200"); // <- marked for compaction
+ Add(1, 3U, "400", "500", 600); // <- this one needs compacting
+ Add(2, 4U, "150", "200");
+ Add(2, 5U, "201", "210");
+ Add(2, 6U, "300", "310");
+ Add(2, 7U, "400", "500"); // <- being compacted
+
+ vstorage_->LevelFiles(2)[3]->being_compacted = true;
+ vstorage_->LevelFiles(0)[0]->marked_for_compaction = true;
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+}
+
+// This test checks ExpandWhileOverlapping() by having overlapping user keys
+// ranges (with different sequence numbers) in the input files.
+TEST_F(CompactionPickerTest, OverlappingUserKeys) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ ioptions_.compaction_pri = kByCompensatedSize;
+
+ Add(1, 1U, "100", "150", 1U);
+ // Overlapping user keys
+ Add(1, 2U, "200", "400", 1U);
+ Add(1, 3U, "400", "500", 1000000000U, 0, 0);
+ Add(2, 4U, "600", "700", 1U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_levels());
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(3U, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys2) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ // Overlapping user keys on same level and output level
+ Add(1, 1U, "200", "400", 1000000000U);
+ Add(1, 2U, "400", "500", 1U, 0, 0);
+ Add(2, 3U, "000", "100", 1U);
+ Add(2, 4U, "100", "600", 1U, 0, 0);
+ Add(2, 5U, "600", "700", 1U, 0, 0);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(3U, compaction->num_input_files(1));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(3U, compaction->input(1, 0)->fd.GetNumber());
+ ASSERT_EQ(4U, compaction->input(1, 1)->fd.GetNumber());
+ ASSERT_EQ(5U, compaction->input(1, 2)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys3) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ // Chain of overlapping user key ranges (forces ExpandWhileOverlapping() to
+ // expand multiple times)
+ Add(1, 1U, "100", "150", 1U);
+ Add(1, 2U, "150", "200", 1U, 0, 0);
+ Add(1, 3U, "200", "250", 1000000000U, 0, 0);
+ Add(1, 4U, "250", "300", 1U, 0, 0);
+ Add(1, 5U, "300", "350", 1U, 0, 0);
+ // Output level overlaps with the beginning and the end of the chain
+ Add(2, 6U, "050", "100", 1U);
+ Add(2, 7U, "350", "400", 1U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(5U, compaction->num_input_files(0));
+ ASSERT_EQ(2U, compaction->num_input_files(1));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(3U, compaction->input(0, 2)->fd.GetNumber());
+ ASSERT_EQ(4U, compaction->input(0, 3)->fd.GetNumber());
+ ASSERT_EQ(5U, compaction->input(0, 4)->fd.GetNumber());
+ ASSERT_EQ(6U, compaction->input(1, 0)->fd.GetNumber());
+ ASSERT_EQ(7U, compaction->input(1, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys4) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.max_bytes_for_level_base = 1000000;
+
+ Add(1, 1U, "100", "150", 1U);
+ Add(1, 2U, "150", "199", 1U, 0, 0);
+ Add(1, 3U, "200", "250", 1100000U, 0, 0);
+ Add(1, 4U, "251", "300", 1U, 0, 0);
+ Add(1, 5U, "300", "350", 1U, 0, 0);
+
+ Add(2, 6U, "100", "115", 1U);
+ Add(2, 7U, "125", "325", 1U);
+ Add(2, 8U, "350", "400", 1U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->num_input_files(1));
+ ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(7U, compaction->input(1, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys5) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ // Overlapping user keys on same level and output level
+ Add(1, 1U, "200", "400", 1000000000U);
+ Add(1, 2U, "400", "500", 1U, 0, 0);
+ Add(2, 3U, "000", "100", 1U);
+ Add(2, 4U, "100", "600", 1U, 0, 0);
+ Add(2, 5U, "600", "700", 1U, 0, 0);
+
+ vstorage_->LevelFiles(2)[2]->being_compacted = true;
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() == nullptr);
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys6) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ // Overlapping user keys on same level and output level
+ Add(1, 1U, "200", "400", 1U, 0, 0);
+ Add(1, 2U, "401", "500", 1U, 0, 0);
+ Add(2, 3U, "000", "100", 1U);
+ Add(2, 4U, "100", "300", 1U, 0, 0);
+ Add(2, 5U, "305", "450", 1U, 0, 0);
+ Add(2, 6U, "460", "600", 1U, 0, 0);
+ Add(2, 7U, "600", "700", 1U, 0, 0);
+
+ vstorage_->LevelFiles(1)[0]->marked_for_compaction = true;
+ vstorage_->LevelFiles(1)[1]->marked_for_compaction = true;
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(3U, compaction->num_input_files(1));
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys7) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.max_compaction_bytes = 100000000000u;
+ // Overlapping user keys on same level and output level
+ Add(1, 1U, "200", "400", 1U, 0, 0);
+ Add(1, 2U, "401", "500", 1000000000U, 0, 0);
+ Add(2, 3U, "100", "250", 1U);
+ Add(2, 4U, "300", "600", 1U, 0, 0);
+ Add(2, 5U, "600", "800", 1U, 0, 0);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_GE(1U, compaction->num_input_files(0));
+ ASSERT_GE(2U, compaction->num_input_files(1));
+ // File 5 has to be included in the compaction
+ ASSERT_EQ(5U, compaction->inputs(1)->back()->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys8) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.max_compaction_bytes = 100000000000u;
+ // grow the number of inputs in "level" without
+ // changing the number of "level+1" files we pick up
+ // Expand input level as much as possible
+ // no overlapping case
+ Add(1, 1U, "101", "150", 1U);
+ Add(1, 2U, "151", "200", 1U);
+ Add(1, 3U, "201", "300", 1000000000U);
+ Add(1, 4U, "301", "400", 1U);
+ Add(1, 5U, "401", "500", 1U);
+ Add(2, 6U, "150", "200", 1U);
+ Add(2, 7U, "200", "450", 1U, 0, 0);
+ Add(2, 8U, "500", "600", 1U);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(3U, compaction->num_input_files(0));
+ ASSERT_EQ(2U, compaction->num_input_files(1));
+ ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(3U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(4U, compaction->input(0, 2)->fd.GetNumber());
+ ASSERT_EQ(6U, compaction->input(1, 0)->fd.GetNumber());
+ ASSERT_EQ(7U, compaction->input(1, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys9) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.max_compaction_bytes = 100000000000u;
+ // grow the number of inputs in "level" without
+ // changing the number of "level+1" files we pick up
+ // Expand input level as much as possible
+ // overlapping case
+ Add(1, 1U, "121", "150", 1U);
+ Add(1, 2U, "151", "200", 1U);
+ Add(1, 3U, "201", "300", 1000000000U);
+ Add(1, 4U, "301", "400", 1U);
+ Add(1, 5U, "401", "500", 1U);
+ Add(2, 6U, "100", "120", 1U);
+ Add(2, 7U, "150", "200", 1U);
+ Add(2, 8U, "200", "450", 1U, 0, 0);
+ Add(2, 9U, "501", "600", 1U);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(5U, compaction->num_input_files(0));
+ ASSERT_EQ(2U, compaction->num_input_files(1));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(3U, compaction->input(0, 2)->fd.GetNumber());
+ ASSERT_EQ(4U, compaction->input(0, 3)->fd.GetNumber());
+ ASSERT_EQ(7U, compaction->input(1, 0)->fd.GetNumber());
+ ASSERT_EQ(8U, compaction->input(1, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys10) {
+ // Locked file encountered when pulling in extra input-level files with same
+ // user keys. Verify we pick the next-best file from the same input level.
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.max_compaction_bytes = 100000000000u;
+
+ // file_number 2U is largest and thus first choice. But it overlaps with
+ // file_number 1U which is being compacted. So instead we pick the next-
+ // biggest file, 3U, which is eligible for compaction.
+ Add(1 /* level */, 1U /* file_number */, "100" /* smallest */,
+ "150" /* largest */, 1U /* file_size */);
+ file_map_[1U].first->being_compacted = true;
+ Add(1 /* level */, 2U /* file_number */, "150" /* smallest */,
+ "200" /* largest */, 1000000000U /* file_size */, 0 /* smallest_seq */,
+ 0 /* largest_seq */);
+ Add(1 /* level */, 3U /* file_number */, "201" /* smallest */,
+ "250" /* largest */, 900000000U /* file_size */);
+ Add(2 /* level */, 4U /* file_number */, "100" /* smallest */,
+ "150" /* largest */, 1U /* file_size */);
+ Add(2 /* level */, 5U /* file_number */, "151" /* smallest */,
+ "200" /* largest */, 1U /* file_size */);
+ Add(2 /* level */, 6U /* file_number */, "201" /* smallest */,
+ "250" /* largest */, 1U /* file_size */);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->num_input_files(1));
+ ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(6U, compaction->input(1, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys11) {
+ // Locked file encountered when pulling in extra output-level files with same
+ // user keys. Expected to skip that compaction and pick the next-best choice.
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.max_compaction_bytes = 100000000000u;
+
+ // score(L1) = 3.7
+ // score(L2) = 1.85
+ // There is no eligible file in L1 to compact since both candidates pull in
+ // file_number 5U, which overlaps with a file pending compaction (6U). The
+ // first eligible compaction is from L2->L3.
+ Add(1 /* level */, 2U /* file_number */, "151" /* smallest */,
+ "200" /* largest */, 1000000000U /* file_size */);
+ Add(1 /* level */, 3U /* file_number */, "201" /* smallest */,
+ "250" /* largest */, 1U /* file_size */);
+ Add(2 /* level */, 4U /* file_number */, "100" /* smallest */,
+ "149" /* largest */, 5000000000U /* file_size */);
+ Add(2 /* level */, 5U /* file_number */, "150" /* smallest */,
+ "201" /* largest */, 1U /* file_size */);
+ Add(2 /* level */, 6U /* file_number */, "201" /* smallest */,
+ "249" /* largest */, 1U /* file_size */, 0 /* smallest_seq */,
+ 0 /* largest_seq */);
+ file_map_[6U].first->being_compacted = true;
+ Add(3 /* level */, 7U /* file_number */, "100" /* smallest */,
+ "149" /* largest */, 1U /* file_size */);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->num_input_files(1));
+ ASSERT_EQ(4U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(7U, compaction->input(1, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, FileTtlBooster) {
+  // Set TTL to 2048.
+  // TTL boosting for all levels starts at 1024.
+  // The whole TTL boost range is 2048 * 31 / 32 - 1024 = 1984 - 1024 = 960.
+  // Going up from the second last level (L5), the boost range starts at
+  // 1024 + 480 (L5), 1024 + 240 (L4), 1024 + 120 (L3).
+  // Boosting step for L3 is 120 / 16 = 7.5 -> 7.
+ //
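+  // Judging from the expectations below, once a file's age passes the boost
+  // start for its level, the score grows roughly as
+  // 1 + (age - boost_start) / step.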
+ const uint64_t kCurrentTime = 1000000;
+ FileMetaData meta;
+
+ {
+ FileTtlBooster booster(kCurrentTime, 2048, 7, 3);
+
+ // Not triggering if the file is younger than ttl/2
+ meta.oldest_ancester_time = kCurrentTime - 1023;
+ ASSERT_EQ(1, booster.GetBoostScore(&meta));
+ meta.oldest_ancester_time = kCurrentTime - 1024;
+ ASSERT_EQ(1, booster.GetBoostScore(&meta));
+ meta.oldest_ancester_time = kCurrentTime + 10;
+ ASSERT_EQ(1, booster.GetBoostScore(&meta));
+
+ // Within one boosting step
+ meta.oldest_ancester_time = kCurrentTime - (1024 + 120 + 6);
+ ASSERT_EQ(1, booster.GetBoostScore(&meta));
+
+ // One boosting step
+ meta.oldest_ancester_time = kCurrentTime - (1024 + 120 + 7);
+ ASSERT_EQ(2, booster.GetBoostScore(&meta));
+ meta.oldest_ancester_time = kCurrentTime - (1024 + 120 + 8);
+ ASSERT_EQ(2, booster.GetBoostScore(&meta));
+
+ // Multiple boosting steps
+ meta.oldest_ancester_time = kCurrentTime - (1024 + 120 + 30);
+ ASSERT_EQ(5, booster.GetBoostScore(&meta));
+
+ // Very high boosting steps
+ meta.oldest_ancester_time = kCurrentTime - (1024 + 120 + 700);
+ ASSERT_EQ(101, booster.GetBoostScore(&meta));
+ }
+ {
+ // Test second last level
+ FileTtlBooster booster(kCurrentTime, 2048, 7, 5);
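+    // For L5 the boost range is 960 / 2 = 480, so the step is 480 / 16 = 30;
+    // a file 60 past the start should get score 1 + 60 / 30 = 3.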
+ meta.oldest_ancester_time = kCurrentTime - (1024 + 480);
+ ASSERT_EQ(1, booster.GetBoostScore(&meta));
+ meta.oldest_ancester_time = kCurrentTime - (1024 + 480 + 60);
+ ASSERT_EQ(3, booster.GetBoostScore(&meta));
+ }
+ {
+ // Test last level
+ FileTtlBooster booster(kCurrentTime, 2048, 7, 6);
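+    // Judging from the expectations below, no boosting is applied to the
+    // last level, so the score stays at 1 regardless of the file's age.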
+ meta.oldest_ancester_time = kCurrentTime - (1024 + 480);
+ ASSERT_EQ(1, booster.GetBoostScore(&meta));
+ meta.oldest_ancester_time = kCurrentTime - (1024 + 480 + 60);
+ ASSERT_EQ(1, booster.GetBoostScore(&meta));
+ meta.oldest_ancester_time = kCurrentTime - 3000;
+ ASSERT_EQ(1, booster.GetBoostScore(&meta));
+ }
+}
+
+TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri1) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 900000000U;
+
+ // 6 L0 files, score 3.
+ Add(0, 1U, "000", "400", 1U);
+ Add(0, 2U, "001", "400", 1U, 0, 0);
+ Add(0, 3U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 31U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 32U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 33U, "001", "400", 1000000000U, 0, 0);
+
+ // L1 total size 2GB, score 2.2. If one file being compacted, score 1.1.
+ Add(1, 4U, "050", "300", 1000000000U, 0, 0);
+ file_map_[4u].first->being_compacted = true;
+ Add(1, 5U, "301", "350", 1000000000U, 0, 0);
+
+  // Output level files overlap with the beginning and the end of the L1 range
+ Add(2, 6U, "050", "100", 1U);
+ Add(2, 7U, "300", "400", 1U);
+
+  // No compaction should be scheduled if L0 has higher priority than L1
+  // but the L0->L1 compaction is blocked by a file in L1 being compacted.
+ UpdateVersionStorageInfo();
+ ASSERT_EQ(0, vstorage_->CompactionScoreLevel(0));
+ ASSERT_EQ(1, vstorage_->CompactionScoreLevel(1));
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() == nullptr);
+}
+
+TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri2) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 900000000U;
+
+ // 6 L0 files, score 3.
+ Add(0, 1U, "000", "400", 1U);
+ Add(0, 2U, "001", "400", 1U, 0, 0);
+ Add(0, 3U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 31U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 32U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 33U, "001", "400", 1000000000U, 0, 0);
+
+ // L1 total size 2GB, score 2.2. If one file being compacted, score 1.1.
+ Add(1, 4U, "050", "300", 1000000000U, 0, 0);
+ Add(1, 5U, "301", "350", 1000000000U, 0, 0);
+
+  // Output level files overlap with the beginning and the end of the L1 range
+ Add(2, 6U, "050", "100", 1U);
+ Add(2, 7U, "300", "400", 1U);
+
+  // If no file in L1 is being compacted, an L0->L1 compaction will be
+  // scheduled.
+ UpdateVersionStorageInfo(); // being_compacted flag is cleared here.
+ ASSERT_EQ(0, vstorage_->CompactionScoreLevel(0));
+ ASSERT_EQ(1, vstorage_->CompactionScoreLevel(1));
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+}
+
+TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri3) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 900000000U;
+
+ // 6 L0 files, score 3.
+ Add(0, 1U, "000", "400", 1U);
+ Add(0, 2U, "001", "400", 1U, 0, 0);
+ Add(0, 3U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 31U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 32U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 33U, "001", "400", 1000000000U, 0, 0);
+
+ // L1 score more than 6.
+ Add(1, 4U, "050", "300", 1000000000U, 0, 0);
+ file_map_[4u].first->being_compacted = true;
+ Add(1, 5U, "301", "350", 1000000000U, 0, 0);
+ Add(1, 51U, "351", "400", 6000000000U, 0, 0);
+
+  // Output level files overlap with the beginning and the end of the L1 range
+ Add(2, 6U, "050", "100", 1U);
+ Add(2, 7U, "300", "400", 1U);
+
+  // If the score of L1 is larger than that of L0, the L1 compaction goes
+  // through even though there is a pending L0 compaction.
+ UpdateVersionStorageInfo();
+ ASSERT_EQ(1, vstorage_->CompactionScoreLevel(0));
+ ASSERT_EQ(0, vstorage_->CompactionScoreLevel(1));
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+}
+
+TEST_F(CompactionPickerTest, EstimateCompactionBytesNeeded1) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 4;
+ mutable_cf_options_.max_bytes_for_level_base = 1000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200", 200);
+ Add(0, 2U, "150", "200", 200);
+ Add(0, 3U, "150", "200", 200);
+ // Level 1 is over target by 200
+ Add(1, 4U, "400", "500", 600);
+ Add(1, 5U, "600", "700", 600);
+  // Level 2 is less than target 10000 even after adding the size of level 1
+  // Size ratio of L2/L1 is 9600 / 1200 = 8
+ Add(2, 6U, "150", "200", 2500);
+ Add(2, 7U, "201", "210", 2000);
+ Add(2, 8U, "300", "310", 2600);
+ Add(2, 9U, "400", "500", 2500);
+  // Level 3 exceeds its target of 100,000 by 1,000
+ Add(3, 10U, "400", "500", 101000);
+ // Level 4 exceeds target 1,000,000 by 900 after adding size from level 3
+ // Size ratio L4/L3 is 9.9
+ // After merge from L3, L4 size is 1000900
+ Add(4, 11U, "400", "500", 999900);
+ Add(5, 12U, "400", "500", 8007200);
+
+ UpdateVersionStorageInfo();
+
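+  // A rough breakdown of the expected estimate, inferred from the sizes and
+  // ratios noted above:
+  //   L1 -> L2: 200 over target * (9600 / 1200 + 1) = 200 * 9 = 1800
+  //   L3 -> L4: 1000 over target * (9.9 + 1) = 10900
+  //   L4 -> L5: 900 over target * (8007200 / 1000900 + 1) ~= 900 * 9 = 8100
+  // L0 (3 files, below the trigger of 4) and L2 (below target) add nothing.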
+ ASSERT_EQ(200u * 9u + 10900u + 900u * 9,
+ vstorage_->estimated_compaction_needed_bytes());
+}
+
+TEST_F(CompactionPickerTest, EstimateCompactionBytesNeeded2) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 3;
+ mutable_cf_options_.max_bytes_for_level_base = 1000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200", 200);
+ Add(0, 2U, "150", "200", 200);
+ Add(0, 4U, "150", "200", 200);
+ Add(0, 5U, "150", "200", 200);
+ Add(0, 6U, "150", "200", 200);
+ // Level 1 size will be 1400 after merging with L0
+ Add(1, 7U, "400", "500", 200);
+ Add(1, 8U, "600", "700", 200);
+  // Level 2 is less than target 10000 even after adding the size of level 1
+ Add(2, 9U, "150", "200", 9100);
+  // Level 3 is over the target, but since level 4 is empty, we assume it will
+  // be a trivial move.
+ Add(3, 10U, "400", "500", 101000);
+
+ UpdateVersionStorageInfo();
+
+ // estimated L1->L2 merge: 400 * (9100.0 / 1400.0 + 1.0)
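+  // The 1400 term below is, presumably, the estimated L0->L1 merge:
+  // 1000 (five 200-byte L0 files) + 400 (level 1) = 1400.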
+ ASSERT_EQ(1400u + 3000u, vstorage_->estimated_compaction_needed_bytes());
+}
+
+TEST_F(CompactionPickerTest, EstimateCompactionBytesNeeded3) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 3;
+ mutable_cf_options_.max_bytes_for_level_base = 1000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200", 2000);
+ Add(0, 2U, "150", "200", 2000);
+ Add(0, 4U, "150", "200", 2000);
+ Add(0, 5U, "150", "200", 2000);
+ Add(0, 6U, "150", "200", 1000);
+ // Level 1 size will be 10000 after merging with L0
+ Add(1, 7U, "400", "500", 500);
+ Add(1, 8U, "600", "700", 500);
+
+ Add(2, 9U, "150", "200", 10000);
+
+ UpdateVersionStorageInfo();
+
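+  // Presumably: the L0->L1 merge is 9000 (L0) + 1000 (L1) = 10000; L1 then
+  // exceeds its target of 1000 by 9000, and merging into the 10000-byte L2
+  // costs about 9000 * (10000 / 10000 + 1) = 18000.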
+ ASSERT_EQ(10000u + 18000u, vstorage_->estimated_compaction_needed_bytes());
+}
+
+TEST_F(CompactionPickerTest, EstimateCompactionBytesNeededDynamicLevel) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 3;
+ mutable_cf_options_.max_bytes_for_level_base = 1000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+
+  // Set the last level (num_levels - 1) size to 50000.
+  // num_levels - 2 then has target 5000.
+  // num_levels - 3 is the base level with target 1000 (rounded up to
+  // max_bytes_for_level_base).
+ Add(num_levels - 1, 10U, "400", "500", 50000);
+
+ Add(0, 1U, "150", "200", 200);
+ Add(0, 2U, "150", "200", 200);
+ Add(0, 4U, "150", "200", 200);
+ Add(0, 5U, "150", "200", 200);
+ Add(0, 6U, "150", "200", 200);
+ // num_levels - 3 is over target by 100 + 1000
+ Add(num_levels - 3, 7U, "400", "500", 550);
+ Add(num_levels - 3, 8U, "600", "700", 550);
+ // num_levels - 2 is over target by 1100 + 200
+ Add(num_levels - 2, 9U, "150", "200", 5200);
+
+ UpdateVersionStorageInfo();
+
+ // Merging to the second last level: (5200 / 2100 + 1) * 1100
+ // Merging to the last level: (50000 / 6300 + 1) * 1300
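+  // The leading 2100 term is, presumably, the estimated L0 -> base level
+  // merge: 1000 (L0) + 1100 (num_levels - 3) = 2100.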
+ ASSERT_EQ(2100u + 3823u + 11617u,
+ vstorage_->estimated_compaction_needed_bytes());
+}
+
+TEST_F(CompactionPickerTest, IsBottommostLevelTest) {
+ // case 1: Higher levels are empty
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(0, 1U, "a", "m");
+ Add(0, 2U, "c", "z");
+ Add(1, 3U, "d", "e");
+ Add(1, 4U, "l", "p");
+ Add(2, 5U, "g", "i");
+ Add(2, 6U, "x", "z");
+ UpdateVersionStorageInfo();
+ SetCompactionInputFilesLevels(2, 1);
+ AddToCompactionFiles(3U);
+ AddToCompactionFiles(5U);
+ bool result =
+ Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+ ASSERT_TRUE(result);
+
+ // case 2: Higher levels have no overlap
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(0, 1U, "a", "m");
+ Add(0, 2U, "c", "z");
+ Add(1, 3U, "d", "e");
+ Add(1, 4U, "l", "p");
+ Add(2, 5U, "g", "i");
+ Add(2, 6U, "x", "z");
+ Add(3, 7U, "k", "p");
+ Add(3, 8U, "t", "w");
+ Add(4, 9U, "a", "b");
+ Add(5, 10U, "c", "cc");
+ UpdateVersionStorageInfo();
+ SetCompactionInputFilesLevels(2, 1);
+ AddToCompactionFiles(3U);
+ AddToCompactionFiles(5U);
+ result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+ ASSERT_TRUE(result);
+
+ // case 3.1: Higher levels (level 3) have overlap
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(0, 1U, "a", "m");
+ Add(0, 2U, "c", "z");
+ Add(1, 3U, "d", "e");
+ Add(1, 4U, "l", "p");
+ Add(2, 5U, "g", "i");
+ Add(2, 6U, "x", "z");
+ Add(3, 7U, "e", "g");
+ Add(3, 8U, "h", "k");
+ Add(4, 9U, "a", "b");
+ Add(5, 10U, "c", "cc");
+ UpdateVersionStorageInfo();
+ SetCompactionInputFilesLevels(2, 1);
+ AddToCompactionFiles(3U);
+ AddToCompactionFiles(5U);
+ result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+ ASSERT_FALSE(result);
+
+ // case 3.2: Higher levels (level 5) have overlap
+ DeleteVersionStorage();
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(0, 1U, "a", "m");
+ Add(0, 2U, "c", "z");
+ Add(1, 3U, "d", "e");
+ Add(1, 4U, "l", "p");
+ Add(2, 5U, "g", "i");
+ Add(2, 6U, "x", "z");
+ Add(3, 7U, "j", "k");
+ Add(3, 8U, "l", "m");
+ Add(4, 9U, "a", "b");
+ Add(5, 10U, "c", "cc");
+ Add(5, 11U, "h", "k");
+ Add(5, 12U, "y", "yy");
+ Add(5, 13U, "z", "zz");
+ UpdateVersionStorageInfo();
+ SetCompactionInputFilesLevels(2, 1);
+ AddToCompactionFiles(3U);
+ AddToCompactionFiles(5U);
+ result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+ ASSERT_FALSE(result);
+
+ // case 3.3: Higher levels (level 5) have overlap, but it's only overlapping
+ // one key ("d")
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(0, 1U, "a", "m");
+ Add(0, 2U, "c", "z");
+ Add(1, 3U, "d", "e");
+ Add(1, 4U, "l", "p");
+ Add(2, 5U, "g", "i");
+ Add(2, 6U, "x", "z");
+ Add(3, 7U, "j", "k");
+ Add(3, 8U, "l", "m");
+ Add(4, 9U, "a", "b");
+ Add(5, 10U, "c", "cc");
+ Add(5, 11U, "ccc", "d");
+ Add(5, 12U, "y", "yy");
+ Add(5, 13U, "z", "zz");
+ UpdateVersionStorageInfo();
+ SetCompactionInputFilesLevels(2, 1);
+ AddToCompactionFiles(3U);
+ AddToCompactionFiles(5U);
+ result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+ ASSERT_FALSE(result);
+
+ // Level 0 files overlap
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(0, 1U, "s", "t");
+ Add(0, 2U, "a", "m");
+ Add(0, 3U, "b", "z");
+ Add(0, 4U, "e", "f");
+ Add(5, 10U, "y", "z");
+ UpdateVersionStorageInfo();
+ SetCompactionInputFilesLevels(1, 0);
+ AddToCompactionFiles(1U);
+ AddToCompactionFiles(2U);
+ AddToCompactionFiles(3U);
+ AddToCompactionFiles(4U);
+ result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+ ASSERT_FALSE(result);
+
+ // Level 0 files don't overlap
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(0, 1U, "s", "t");
+ Add(0, 2U, "a", "m");
+ Add(0, 3U, "b", "k");
+ Add(0, 4U, "e", "f");
+ Add(5, 10U, "y", "z");
+ UpdateVersionStorageInfo();
+ SetCompactionInputFilesLevels(1, 0);
+ AddToCompactionFiles(1U);
+ AddToCompactionFiles(2U);
+ AddToCompactionFiles(3U);
+ AddToCompactionFiles(4U);
+ result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+ ASSERT_TRUE(result);
+
+ // Level 1 files overlap
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(0, 1U, "s", "t");
+ Add(0, 2U, "a", "m");
+ Add(0, 3U, "b", "k");
+ Add(0, 4U, "e", "f");
+ Add(1, 5U, "a", "m");
+ Add(1, 6U, "n", "o");
+ Add(1, 7U, "w", "y");
+ Add(5, 10U, "y", "z");
+ UpdateVersionStorageInfo();
+ SetCompactionInputFilesLevels(2, 0);
+ AddToCompactionFiles(1U);
+ AddToCompactionFiles(2U);
+ AddToCompactionFiles(3U);
+ AddToCompactionFiles(4U);
+ AddToCompactionFiles(5U);
+ AddToCompactionFiles(6U);
+ AddToCompactionFiles(7U);
+ result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+ ASSERT_FALSE(result);
+
+ DeleteVersionStorage();
+}
+
+TEST_F(CompactionPickerTest, MaxCompactionBytesHit) {
+ mutable_cf_options_.max_bytes_for_level_base = 1000000u;
+ mutable_cf_options_.max_compaction_bytes = 800000u;
+ mutable_cf_options_.ignore_max_compaction_bytes_for_input = false;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ NewVersionStorage(6, kCompactionStyleLevel);
+  // A compaction should be triggered and pick files 2 and 5.
+  // It cannot expand, because after adding files 1 and 3 the compaction size
+  // would exceed mutable_cf_options_.max_compaction_bytes.
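+  // Files 2 (300001) + 5 (1) fit, but adding files 1 and 3 would add another
+  // 600000 bytes and push the total past the 800000-byte limit.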
+ Add(1, 1U, "100", "150", 300000U);
+ Add(1, 2U, "151", "200", 300001U, 0, 0);
+ Add(1, 3U, "201", "250", 300000U, 0, 0);
+ Add(1, 4U, "251", "300", 300000U, 0, 0);
+ Add(2, 5U, "100", "256", 1U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->num_input_files(1));
+ ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(5U, compaction->input(1, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, MaxCompactionBytesNotHit) {
+ mutable_cf_options_.max_bytes_for_level_base = 800000u;
+ mutable_cf_options_.max_compaction_bytes = 1000000u;
+ mutable_cf_options_.ignore_max_compaction_bytes_for_input = false;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ NewVersionStorage(6, kCompactionStyleLevel);
+  // A compaction should be triggered and pick files 2 and 5,
+  // and it expands to include files 1 and 3 too.
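+  // Files 1 + 2 + 3 + 5 total 900002 bytes, which stays under the
+  // 1000000-byte max_compaction_bytes limit.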
+ Add(1, 1U, "100", "150", 300000U);
+ Add(1, 2U, "151", "200", 300001U, 0, 0);
+ Add(1, 3U, "201", "250", 300000U, 0, 0);
+ Add(1, 4U, "251", "300", 300000U, 0, 0);
+ Add(2, 5U, "000", "251", 1U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(3U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->num_input_files(1));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(3U, compaction->input(0, 2)->fd.GetNumber());
+ ASSERT_EQ(5U, compaction->input(1, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, IsTrivialMoveOn) {
+ mutable_cf_options_.max_bytes_for_level_base = 10000u;
+ mutable_cf_options_.max_compaction_bytes = 10001u;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ NewVersionStorage(6, kCompactionStyleLevel);
+ // A compaction should be triggered and pick file 2
+ Add(1, 1U, "100", "150", 3000U);
+ Add(1, 2U, "151", "200", 3001U);
+ Add(1, 3U, "201", "250", 3000U);
+ Add(1, 4U, "251", "300", 3000U);
+
+ Add(3, 5U, "120", "130", 7000U);
+ Add(3, 6U, "170", "180", 7000U);
+ Add(3, 7U, "220", "230", 7000U);
+ Add(3, 8U, "270", "280", 7000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_TRUE(compaction->IsTrivialMove());
+}
+
+TEST_F(CompactionPickerTest, L0TrivialMove1) {
+ mutable_cf_options_.max_bytes_for_level_base = 10000000u;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 4;
+ mutable_cf_options_.max_compaction_bytes = 10000000u;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ NewVersionStorage(6, kCompactionStyleLevel);
+
+ Add(0, 1U, "100", "150", 3000U, 0, 710, 800);
+ Add(0, 2U, "151", "200", 3001U, 0, 610, 700);
+ Add(0, 3U, "301", "350", 3000U, 0, 510, 600);
+ Add(0, 4U, "451", "400", 3000U, 0, 410, 500);
+
+ Add(1, 5U, "120", "130", 7000U);
+ Add(1, 6U, "170", "180", 7000U);
+ Add(1, 7U, "220", "230", 7000U);
+ Add(1, 8U, "270", "280", 7000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1, compaction->num_input_levels());
+ ASSERT_EQ(2, compaction->num_input_files(0));
+ ASSERT_EQ(3, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(4, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_TRUE(compaction->IsTrivialMove());
+}
+
+TEST_F(CompactionPickerTest, L0TrivialMoveOneFile) {
+ mutable_cf_options_.max_bytes_for_level_base = 10000000u;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 4;
+ mutable_cf_options_.max_compaction_bytes = 10000000u;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ NewVersionStorage(6, kCompactionStyleLevel);
+
+ Add(0, 1U, "100", "150", 3000U, 0, 710, 800);
+ Add(0, 2U, "551", "600", 3001U, 0, 610, 700);
+ Add(0, 3U, "101", "150", 3000U, 0, 510, 600);
+ Add(0, 4U, "451", "400", 3000U, 0, 410, 500);
+
+ Add(1, 5U, "120", "130", 7000U);
+ Add(1, 6U, "170", "180", 7000U);
+ Add(1, 7U, "220", "230", 7000U);
+ Add(1, 8U, "270", "280", 7000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1, compaction->num_input_levels());
+ ASSERT_EQ(1, compaction->num_input_files(0));
+ ASSERT_EQ(4, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_TRUE(compaction->IsTrivialMove());
+}
+
+TEST_F(CompactionPickerTest, L0TrivialMoveWholeL0) {
+ mutable_cf_options_.max_bytes_for_level_base = 10000000u;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 4;
+ mutable_cf_options_.max_compaction_bytes = 10000000u;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ NewVersionStorage(6, kCompactionStyleLevel);
+
+ Add(0, 1U, "300", "350", 3000U, 0, 710, 800);
+ Add(0, 2U, "651", "600", 3001U, 0, 610, 700);
+ Add(0, 3U, "501", "550", 3000U, 0, 510, 600);
+ Add(0, 4U, "451", "400", 3000U, 0, 410, 500);
+
+ Add(1, 5U, "120", "130", 7000U);
+ Add(1, 6U, "970", "980", 7000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1, compaction->num_input_levels());
+ ASSERT_EQ(4, compaction->num_input_files(0));
+ ASSERT_EQ(1, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(4, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(3, compaction->input(0, 2)->fd.GetNumber());
+ ASSERT_EQ(2, compaction->input(0, 3)->fd.GetNumber());
+ ASSERT_TRUE(compaction->IsTrivialMove());
+}
+
+TEST_F(CompactionPickerTest, IsTrivialMoveOffSstPartitioned) {
+ mutable_cf_options_.max_bytes_for_level_base = 10000u;
+ mutable_cf_options_.max_compaction_bytes = 10001u;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ ioptions_.sst_partitioner_factory = NewSstPartitionerFixedPrefixFactory(1);
+ NewVersionStorage(6, kCompactionStyleLevel);
+ // A compaction should be triggered and pick file 2
+ Add(1, 1U, "100", "150", 3000U);
+ Add(1, 2U, "151", "200", 3001U);
+ Add(1, 3U, "201", "250", 3000U);
+ Add(1, 4U, "251", "300", 3000U);
+
+ Add(3, 5U, "120", "130", 7000U);
+ Add(3, 6U, "170", "180", 7000U);
+ Add(3, 7U, "220", "230", 7000U);
+ Add(3, 8U, "270", "280", 7000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ // No trivial move, because partitioning is applied
+ ASSERT_TRUE(!compaction->IsTrivialMove());
+}
+
+TEST_F(CompactionPickerTest, IsTrivialMoveOff) {
+ mutable_cf_options_.max_bytes_for_level_base = 1000000u;
+ mutable_cf_options_.max_compaction_bytes = 10000u;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ NewVersionStorage(6, kCompactionStyleLevel);
+ // A compaction should be triggered and pick all files from level 1
+ Add(1, 1U, "100", "150", 300000U, 0, 0);
+ Add(1, 2U, "150", "200", 300000U, 0, 0);
+ Add(1, 3U, "200", "250", 300000U, 0, 0);
+ Add(1, 4U, "250", "300", 300000U, 0, 0);
+
+ Add(3, 5U, "120", "130", 6000U);
+ Add(3, 6U, "140", "150", 6000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_FALSE(compaction->IsTrivialMove());
+}
+
+TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles1) {
+ mutable_cf_options_.max_bytes_for_level_base = 1000u;
+ mutable_cf_options_.max_compaction_bytes = 10000001u;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ NewVersionStorage(6, kCompactionStyleLevel);
+
+ Add(2, 1U, "100", "150", 3000U);
+ Add(2, 2U, "151", "200", 3001U);
+ Add(2, 3U, "301", "350", 3000U);
+ Add(2, 4U, "451", "400", 3000U);
+ Add(2, 5U, "551", "500", 3000U);
+ Add(2, 6U, "651", "600", 3000U);
+ Add(2, 7U, "751", "700", 3000U);
+ Add(2, 8U, "851", "900", 3000U);
+
+ Add(3, 15U, "120", "130", 700U);
+ Add(3, 16U, "170", "180", 700U);
+ Add(3, 17U, "220", "230", 700U);
+ Add(3, 18U, "870", "880", 700U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_TRUE(compaction->IsTrivialMove());
+ ASSERT_EQ(1, compaction->num_input_levels());
+ ASSERT_EQ(4, compaction->num_input_files(0));
+ ASSERT_EQ(3, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(4, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(5, compaction->input(0, 2)->fd.GetNumber());
+ ASSERT_EQ(6, compaction->input(0, 3)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles2) {
+ mutable_cf_options_.max_bytes_for_level_base = 1000u;
+ mutable_cf_options_.max_compaction_bytes = 10000001u;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ NewVersionStorage(6, kCompactionStyleLevel);
+
+ Add(2, 1U, "100", "150", 3000U);
+ Add(2, 2U, "151", "160", 3001U);
+ Add(2, 3U, "161", "179", 3000U);
+ Add(2, 4U, "220", "400", 3000U);
+ Add(2, 5U, "551", "500", 3000U);
+ Add(2, 6U, "651", "600", 3000U);
+ Add(2, 7U, "751", "700", 3000U);
+ Add(2, 8U, "851", "900", 3000U);
+
+ Add(3, 15U, "120", "130", 700U);
+ Add(3, 17U, "220", "230", 700U);
+ Add(3, 18U, "870", "880", 700U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_TRUE(compaction->IsTrivialMove());
+ ASSERT_EQ(1, compaction->num_input_levels());
+ ASSERT_EQ(2, compaction->num_input_files(0));
+ ASSERT_EQ(2, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(3, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles3) {
+ mutable_cf_options_.max_bytes_for_level_base = 1000u;
+ mutable_cf_options_.max_compaction_bytes = 10000001u;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ NewVersionStorage(6, kCompactionStyleLevel);
+
+  // Even if consecutive files could be trivially moved, we don't pick them,
+  // since if the trivial move cannot be issued for some reason, we cannot
+  // fall back to a normal compaction.
+ Add(2, 1U, "100", "150", 3000U);
+ Add(2, 2U, "151", "160", 3001U);
+ Add(2, 5U, "551", "500", 3000U);
+ Add(2, 6U, "651", "600", 3000U);
+ Add(2, 7U, "751", "700", 3000U);
+ Add(2, 8U, "851", "900", 3000U);
+
+ Add(3, 15U, "120", "130", 700U);
+ Add(3, 17U, "220", "230", 700U);
+ Add(3, 18U, "870", "880", 700U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_TRUE(compaction->IsTrivialMove());
+ ASSERT_EQ(1, compaction->num_input_levels());
+ ASSERT_EQ(1, compaction->num_input_files(0));
+ ASSERT_EQ(2, compaction->input(0, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles4) {
+ mutable_cf_options_.max_bytes_for_level_base = 1000u;
+ mutable_cf_options_.max_compaction_bytes = 10000001u;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ NewVersionStorage(6, kCompactionStyleLevel);
+
+ Add(2, 1U, "100", "150", 4000U);
+ Add(2, 2U, "151", "160", 4001U);
+ Add(2, 3U, "161", "179", 4000U);
+
+ Add(3, 15U, "120", "130", 700U);
+ Add(3, 17U, "220", "230", 700U);
+ Add(3, 18U, "870", "880", 700U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_TRUE(compaction->IsTrivialMove());
+ ASSERT_EQ(1, compaction->num_input_levels());
+ ASSERT_EQ(2, compaction->num_input_files(0));
+ ASSERT_EQ(2, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(3, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles5) {
+ mutable_cf_options_.max_bytes_for_level_base = 1000u;
+ mutable_cf_options_.max_compaction_bytes = 10000001u;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ NewVersionStorage(6, kCompactionStyleLevel);
+
+  // Files 4 and 5 aren't a clean cut (they share user key "185"), so only
+  // 2 and 3 are picked.
+ Add(2, 1U, "100", "150", 4000U);
+ Add(2, 2U, "151", "160", 4001U);
+ Add(2, 3U, "161", "179", 4000U);
+ Add(2, 4U, "180", "185", 4000U);
+ Add(2, 5U, "185", "190", 4000U);
+
+ Add(3, 15U, "120", "130", 700U);
+ Add(3, 17U, "220", "230", 700U);
+ Add(3, 18U, "870", "880", 700U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_TRUE(compaction->IsTrivialMove());
+ ASSERT_EQ(1, compaction->num_input_levels());
+ ASSERT_EQ(2, compaction->num_input_files(0));
+ ASSERT_EQ(2, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(3, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles6) {
+ mutable_cf_options_.max_bytes_for_level_base = 1000u;
+ mutable_cf_options_.max_compaction_bytes = 10000001u;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ NewVersionStorage(6, kCompactionStyleLevel);
+
+ Add(2, 1U, "100", "150", 3000U);
+ Add(2, 2U, "151", "200", 3001U);
+ Add(2, 3U, "301", "350", 3000U);
+ Add(2, 4U, "451", "400", 3000U);
+ Add(2, 5U, "551", "500", 3000U);
+ file_map_[5U].first->being_compacted = true;
+ Add(2, 6U, "651", "600", 3000U);
+ Add(2, 7U, "751", "700", 3000U);
+ Add(2, 8U, "851", "900", 3000U);
+
+ Add(3, 15U, "120", "130", 700U);
+ Add(3, 16U, "170", "180", 700U);
+ Add(3, 17U, "220", "230", 700U);
+ Add(3, 18U, "870", "880", 700U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_TRUE(compaction->IsTrivialMove());
+ ASSERT_EQ(1, compaction->num_input_levels());
+  // Since the next file (5U) is being compacted, we stop at files 3 and 4.
+ ASSERT_EQ(2, compaction->num_input_files(0));
+ ASSERT_EQ(3, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(4, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, CacheNextCompactionIndex) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.max_compaction_bytes = 100000000000u;
+
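+  // Consistent with the asserts below: file 1U is being compacted and file 2U
+  // overlaps the in-flight compaction of L2 file 5U, so the picker should
+  // choose 3U first and 4U on the next call; the cached next-compaction index
+  // lets later calls skip the files already examined.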
+ Add(1 /* level */, 1U /* file_number */, "100" /* smallest */,
+ "149" /* largest */, 1000000000U /* file_size */);
+ file_map_[1U].first->being_compacted = true;
+ Add(1 /* level */, 2U /* file_number */, "150" /* smallest */,
+ "199" /* largest */, 900000000U /* file_size */);
+ Add(1 /* level */, 3U /* file_number */, "200" /* smallest */,
+ "249" /* largest */, 800000000U /* file_size */);
+ Add(1 /* level */, 4U /* file_number */, "250" /* smallest */,
+ "299" /* largest */, 700000000U /* file_size */);
+ Add(2 /* level */, 5U /* file_number */, "150" /* smallest */,
+ "199" /* largest */, 100U /* file_size */);
+ Add(2 /* level */, 6U /* file_number */, "200" /* smallest */,
+ "240" /* largest */, 1U /* file_size */);
+ Add(2 /* level */, 7U /* file_number */, "260" /* smallest */,
+ "270" /* largest */, 1U /* file_size */);
+ file_map_[5U].first->being_compacted = true;
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->num_input_files(1));
+ ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2, vstorage_->NextCompactionIndex(1 /* level */));
+
+ compaction.reset(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->num_input_files(1));
+ ASSERT_EQ(4U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(3, vstorage_->NextCompactionIndex(1 /* level */));
+
+ compaction.reset(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() == nullptr);
+ ASSERT_EQ(4, vstorage_->NextCompactionIndex(1 /* level */));
+}
+
+TEST_F(CompactionPickerTest, IntraL0MaxCompactionBytesNotHit) {
+ // Intra L0 compaction triggers only if there are at least
+ // level0_file_num_compaction_trigger + 2 L0 files.
+ mutable_cf_options_.level0_file_num_compaction_trigger = 3;
+ mutable_cf_options_.max_compaction_bytes = 1000000u;
+ NewVersionStorage(6, kCompactionStyleLevel);
+
+  // All 5 L0 files will be picked for intra-L0 compaction. The one L1 file
+  // spans the entire L0 key range and is marked as being compacted to avoid
+  // an L0->L1 compaction.
+ Add(0, 1U, "100", "150", 200000U, 0, 100, 101);
+ Add(0, 2U, "151", "200", 200000U, 0, 102, 103);
+ Add(0, 3U, "201", "250", 200000U, 0, 104, 105);
+ Add(0, 4U, "251", "300", 200000U, 0, 106, 107);
+ Add(0, 5U, "301", "350", 200000U, 0, 108, 109);
+ Add(1, 6U, "100", "350", 200000U, 0, 110, 111);
+ vstorage_->LevelFiles(1)[0]->being_compacted = true;
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_levels());
+ ASSERT_EQ(5U, compaction->num_input_files(0));
+ ASSERT_EQ(CompactionReason::kLevelL0FilesNum,
+ compaction->compaction_reason());
+ ASSERT_EQ(0, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, IntraL0MaxCompactionBytesHit) {
+ // Intra L0 compaction triggers only if there are at least
+ // level0_file_num_compaction_trigger + 2 L0 files.
+ mutable_cf_options_.level0_file_num_compaction_trigger = 3;
+ mutable_cf_options_.max_compaction_bytes = 999999u;
+ NewVersionStorage(6, kCompactionStyleLevel);
+
+  // 4 out of 5 L0 files will be picked for intra-L0 compaction due to the
+  // max_compaction_bytes limit (the minimum number of files for triggering an
+  // intra-L0 compaction is 4). The one L1 file spans the entire L0 key range
+  // and is marked as being compacted to avoid an L0->L1 compaction.
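+  // Five 200000-byte files would total 1000000 bytes, just over the 999999
+  // limit, so only four of them fit.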
+ Add(0, 1U, "100", "150", 200000U, 0, 100, 101);
+ Add(0, 2U, "151", "200", 200000U, 0, 102, 103);
+ Add(0, 3U, "201", "250", 200000U, 0, 104, 105);
+ Add(0, 4U, "251", "300", 200000U, 0, 106, 107);
+ Add(0, 5U, "301", "350", 200000U, 0, 108, 109);
+ Add(1, 6U, "100", "350", 200000U, 0, 109, 110);
+ vstorage_->LevelFiles(1)[0]->being_compacted = true;
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_levels());
+ ASSERT_EQ(4U, compaction->num_input_files(0));
+ ASSERT_EQ(CompactionReason::kLevelL0FilesNum,
+ compaction->compaction_reason());
+ ASSERT_EQ(0, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, IntraL0ForEarliestSeqno) {
+ // Intra L0 compaction triggers only if there are at least
+ // level0_file_num_compaction_trigger + 2 L0 files.
+ mutable_cf_options_.level0_file_num_compaction_trigger = 3;
+ mutable_cf_options_.max_compaction_bytes = 999999u;
+ NewVersionStorage(6, kCompactionStyleLevel);
+
+  // 4 out of 6 L0 files will be picked for intra-L0 compaction due to the
+  // being_compacted limit, and the newest L0 file will be skipped because of
+  // the earliest unflushed seqno passed to PickCompaction. The one L1 file
+  // spans the entire L0 key range and is marked as being compacted to avoid
+  // an L0->L1 compaction.
+ Add(1, 1U, "100", "350", 200000U, 0, 110, 111);
+ Add(0, 2U, "301", "350", 1U, 0, 108, 109);
+ Add(0, 3U, "251", "300", 1U, 0, 106, 107);
+ Add(0, 4U, "201", "250", 1U, 0, 104, 105);
+ Add(0, 5U, "151", "200", 1U, 0, 102, 103);
+ Add(0, 6U, "100", "150", 1U, 0, 100, 101);
+ Add(0, 7U, "100", "100", 1U, 0, 99, 100);
+ vstorage_->LevelFiles(0)[5]->being_compacted = true;
+ vstorage_->LevelFiles(1)[0]->being_compacted = true;
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_, 107));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_levels());
+ ASSERT_EQ(4U, compaction->num_input_files(0));
+ ASSERT_EQ(CompactionReason::kLevelL0FilesNum,
+ compaction->compaction_reason());
+ ASSERT_EQ(0, compaction->output_level());
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(CompactionPickerTest, UniversalMarkedCompactionFullOverlap) {
+ const uint64_t kFileSize = 100000;
+
+ ioptions_.compaction_style = kCompactionStyleUniversal;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ // This test covers the case where a "regular" universal compaction is
+ // scheduled first, followed by a delete triggered compaction. The latter
+  // should fail.
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(0, 2U, "201", "250", 2 * kFileSize, 0, 401, 450);
+ Add(0, 4U, "260", "300", 4 * kFileSize, 0, 260, 300);
+ Add(3, 5U, "010", "080", 8 * kFileSize, 0, 200, 251);
+ Add(4, 3U, "301", "350", 8 * kFileSize, 0, 101, 150);
+ Add(4, 6U, "501", "750", 8 * kFileSize, 0, 101, 150);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+
+ ASSERT_TRUE(compaction);
+  // Validate that it's a compaction to reduce sorted runs.
+ ASSERT_EQ(CompactionReason::kUniversalSortedRunNum,
+ compaction->compaction_reason());
+ ASSERT_EQ(0, compaction->output_level());
+ ASSERT_EQ(0, compaction->start_level());
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+
+ AddVersionStorage();
+ // Simulate a flush and mark the file for compaction
+ Add(0, 7U, "150", "200", kFileSize, 0, 551, 600, 0, true);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction2(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_FALSE(compaction2);
+}
+
+TEST_F(CompactionPickerTest, UniversalMarkedCompactionFullOverlap2) {
+ const uint64_t kFileSize = 100000;
+
+ ioptions_.compaction_style = kCompactionStyleUniversal;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ // This test covers the case where a delete triggered compaction is
+ // scheduled first, followed by a "regular" compaction. The latter
+  // should fail.
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ // Mark file number 4 for compaction
+ Add(0, 4U, "260", "300", 4 * kFileSize, 0, 260, 300, 0, true);
+ Add(3, 5U, "240", "290", 8 * kFileSize, 0, 201, 250);
+ Add(4, 3U, "301", "350", 8 * kFileSize, 0, 101, 150);
+ Add(4, 6U, "501", "750", 8 * kFileSize, 0, 101, 150);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+
+ ASSERT_TRUE(compaction);
+  // Validate that it's a delete triggered compaction
+ ASSERT_EQ(CompactionReason::kFilesMarkedForCompaction,
+ compaction->compaction_reason());
+ ASSERT_EQ(3, compaction->output_level());
+ ASSERT_EQ(0, compaction->start_level());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->num_input_files(1));
+
+ AddVersionStorage();
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(0, 2U, "201", "250", 2 * kFileSize, 0, 401, 450);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction2(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_FALSE(compaction2);
+}
+
+TEST_F(CompactionPickerTest, UniversalMarkedCompactionStartOutputOverlap) {
+  // The case where a universal compaction on files marked for compaction can
+  // be picked with some newer files being compacted.
+ const uint64_t kFileSize = 100000;
+
+ ioptions_.compaction_style = kCompactionStyleUniversal;
+
+ bool input_level_overlap = false;
+ bool output_level_overlap = false;
+ // Let's mark 2 files in 2 different levels for compaction. The
+ // compaction picker will randomly pick one, so use the sync point to
+ // ensure a deterministic order. Loop until both cases are covered
+ size_t random_index = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionPicker::PickFilesMarkedForCompaction", [&](void* arg) {
+ size_t* index = static_cast<size_t*>(arg);
+ *index = random_index;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ while (!input_level_overlap || !output_level_overlap) {
+ // Ensure that the L0 file gets picked first
+ random_index = !input_level_overlap ? 0 : 1;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(0, 1U, "260", "300", 4 * kFileSize, 0, 260, 300, 0, true);
+ Add(3, 2U, "010", "020", 2 * kFileSize, 0, 201, 248);
+ Add(3, 3U, "250", "270", 2 * kFileSize, 0, 202, 249);
+ Add(3, 4U, "290", "310", 2 * kFileSize, 0, 203, 250);
+ Add(3, 5U, "310", "320", 2 * kFileSize, 0, 204, 251, 0, true);
+ Add(4, 6U, "301", "350", 8 * kFileSize, 0, 101, 150);
+ Add(4, 7U, "501", "750", 8 * kFileSize, 0, 101, 150);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+
+ ASSERT_TRUE(compaction);
+    // Validate that it's a delete triggered compaction
+ ASSERT_EQ(CompactionReason::kFilesMarkedForCompaction,
+ compaction->compaction_reason());
+ ASSERT_TRUE(compaction->start_level() == 0 ||
+ compaction->start_level() == 3);
+ if (compaction->start_level() == 0) {
+ // The L0 file was picked. The next compaction will detect an
+ // overlap on its input level
+ input_level_overlap = true;
+ ASSERT_EQ(3, compaction->output_level());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(3U, compaction->num_input_files(1));
+ } else {
+ // The level 3 file was picked. The next compaction will pick
+ // the L0 file and will detect overlap when adding output
+ // level inputs
+ output_level_overlap = true;
+ ASSERT_EQ(4, compaction->output_level());
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->num_input_files(1));
+ }
+
+ vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_);
+ // After recomputing the compaction score, only one marked file will remain
+ random_index = 0;
+ std::unique_ptr<Compaction> compaction2(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_FALSE(compaction2);
+ DeleteVersionStorage();
+ }
+}
+
+TEST_F(CompactionPickerTest, UniversalMarkedL0NoOverlap) {
+ const uint64_t kFileSize = 100000;
+
+ ioptions_.compaction_style = kCompactionStyleUniversal;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ // This test covers the case where a delete triggered compaction is
+ // scheduled and should result in a full compaction
+ NewVersionStorage(1, kCompactionStyleUniversal);
+
+ // Mark file number 4 for compaction
+ Add(0, 4U, "260", "300", 1 * kFileSize, 0, 260, 300, 0, true);
+ Add(0, 5U, "240", "290", 2 * kFileSize, 0, 201, 250);
+ Add(0, 3U, "301", "350", 4 * kFileSize, 0, 101, 150);
+ Add(0, 6U, "501", "750", 8 * kFileSize, 0, 50, 100);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+
+ ASSERT_TRUE(compaction);
+  // Validate that it's a delete triggered compaction
+ ASSERT_EQ(CompactionReason::kFilesMarkedForCompaction,
+ compaction->compaction_reason());
+ ASSERT_EQ(0, compaction->output_level());
+ ASSERT_EQ(0, compaction->start_level());
+ ASSERT_EQ(4U, compaction->num_input_files(0));
+ ASSERT_TRUE(file_map_[4].first->being_compacted);
+ ASSERT_TRUE(file_map_[5].first->being_compacted);
+ ASSERT_TRUE(file_map_[3].first->being_compacted);
+ ASSERT_TRUE(file_map_[6].first->being_compacted);
+}
+
+TEST_F(CompactionPickerTest, UniversalMarkedL0WithOverlap) {
+ const uint64_t kFileSize = 100000;
+
+ ioptions_.compaction_style = kCompactionStyleUniversal;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ // This test covers the case where a file is being compacted, and a
+ // delete triggered compaction is then scheduled. The latter should stop
+ // at the first file being compacted
+ NewVersionStorage(1, kCompactionStyleUniversal);
+
+ // Mark file number 4 for compaction
+ Add(0, 4U, "260", "300", 1 * kFileSize, 0, 260, 300, 0, true);
+ Add(0, 5U, "240", "290", 2 * kFileSize, 0, 201, 250);
+ Add(0, 3U, "301", "350", 4 * kFileSize, 0, 101, 150);
+ Add(0, 6U, "501", "750", 8 * kFileSize, 0, 50, 100);
+ UpdateVersionStorageInfo();
+ file_map_[3].first->being_compacted = true;
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+
+ ASSERT_TRUE(compaction);
+  // Validate that it's a delete triggered compaction
+ ASSERT_EQ(CompactionReason::kFilesMarkedForCompaction,
+ compaction->compaction_reason());
+ ASSERT_EQ(0, compaction->output_level());
+ ASSERT_EQ(0, compaction->start_level());
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_TRUE(file_map_[4].first->being_compacted);
+ ASSERT_TRUE(file_map_[5].first->being_compacted);
+}
+
+TEST_F(CompactionPickerTest, UniversalMarkedL0Overlap2) {
+ const uint64_t kFileSize = 100000;
+
+ ioptions_.compaction_style = kCompactionStyleUniversal;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+  // This test covers the case where a delete triggered compaction is
+  // scheduled first, followed by a "regular" compaction. The latter should
+  // pick up the remaining files that are not already being compacted
+ NewVersionStorage(1, kCompactionStyleUniversal);
+
+ // Mark file number 5 for compaction
+ Add(0, 4U, "260", "300", 1 * kFileSize, 0, 260, 300);
+ Add(0, 5U, "240", "290", 2 * kFileSize, 0, 201, 250, 0, true);
+ Add(0, 3U, "301", "350", 4 * kFileSize, 0, 101, 150);
+ Add(0, 6U, "501", "750", 8 * kFileSize, 0, 50, 100);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+
+ ASSERT_TRUE(compaction);
+  // Validate that it's a delete triggered compaction
+ ASSERT_EQ(CompactionReason::kFilesMarkedForCompaction,
+ compaction->compaction_reason());
+ ASSERT_EQ(0, compaction->output_level());
+ ASSERT_EQ(0, compaction->start_level());
+ ASSERT_EQ(3U, compaction->num_input_files(0));
+ ASSERT_TRUE(file_map_[5].first->being_compacted);
+ ASSERT_TRUE(file_map_[3].first->being_compacted);
+ ASSERT_TRUE(file_map_[6].first->being_compacted);
+
+ AddVersionStorage();
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(0, 2U, "201", "250", kFileSize, 0, 401, 450);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction2(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction2);
+  ASSERT_EQ(3U, compaction2->num_input_files(0));
+ ASSERT_TRUE(file_map_[1].first->being_compacted);
+ ASSERT_TRUE(file_map_[2].first->being_compacted);
+ ASSERT_TRUE(file_map_[4].first->being_compacted);
+}
+
+TEST_F(CompactionPickerTest, UniversalMarkedManualCompaction) {
+ const uint64_t kFileSize = 100000;
+ const int kNumLevels = 7;
+
+ // This test makes sure the `files_marked_for_compaction_` is updated after
+  // creating a manual compaction.
+ ioptions_.compaction_style = kCompactionStyleUniversal;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(kNumLevels, kCompactionStyleUniversal);
+
+ // Add 3 files marked for compaction
+ Add(0, 3U, "301", "350", 4 * kFileSize, 0, 101, 150, 0, true);
+ Add(0, 4U, "260", "300", 1 * kFileSize, 0, 260, 300, 0, true);
+ Add(0, 5U, "240", "290", 2 * kFileSize, 0, 201, 250, 0, true);
+ UpdateVersionStorageInfo();
+
+ // All 3 files are marked for compaction
+ ASSERT_EQ(3U, vstorage_->FilesMarkedForCompaction().size());
+
+ bool manual_conflict = false;
+ InternalKey* manual_end = nullptr;
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.CompactRange(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ ColumnFamilyData::kCompactAllLevels, 6, CompactRangeOptions(),
+ nullptr, nullptr, &manual_end, &manual_conflict,
+ std::numeric_limits<uint64_t>::max(), ""));
+
+ ASSERT_TRUE(compaction);
+
+ ASSERT_EQ(CompactionReason::kManualCompaction,
+ compaction->compaction_reason());
+ ASSERT_EQ(kNumLevels - 1, compaction->output_level());
+ ASSERT_EQ(0, compaction->start_level());
+ ASSERT_EQ(3U, compaction->num_input_files(0));
+ ASSERT_TRUE(file_map_[3].first->being_compacted);
+ ASSERT_TRUE(file_map_[4].first->being_compacted);
+ ASSERT_TRUE(file_map_[5].first->being_compacted);
+
+ // After creating the manual compaction, all files should be cleared from
+  // `FilesMarkedForCompaction`, so they won't be picked by other compactions.
+ ASSERT_EQ(0U, vstorage_->FilesMarkedForCompaction().size());
+}
+
+TEST_F(CompactionPickerTest, UniversalSizeAmpTierCompactionNonLastLevel) {
+  // This test makes sure size amplification compaction can still be triggered
+  // if the last sorted run is not on the last level.
+ const uint64_t kFileSize = 100000;
+ const int kNumLevels = 7;
+ const int kLastLevel = kNumLevels - 1;
+
+ ioptions_.compaction_style = kCompactionStyleUniversal;
+ ioptions_.preclude_last_level_data_seconds = 1000;
+ mutable_cf_options_.compaction_options_universal
+ .max_size_amplification_percent = 200;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(kNumLevels, kCompactionStyleUniversal);
+ Add(0, 100U, "100", "300", 1 * kFileSize);
+ Add(0, 101U, "200", "400", 1 * kFileSize);
+ Add(4, 90U, "100", "600", 4 * kFileSize);
+ Add(5, 80U, "200", "300", 2 * kFileSize);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+
+ // Make sure it's a size amp compaction and includes all files
+ ASSERT_EQ(compaction->compaction_reason(),
+ CompactionReason::kUniversalSizeAmplification);
+ ASSERT_EQ(compaction->output_level(), kLastLevel);
+ ASSERT_EQ(compaction->input_levels(0)->num_files, 2);
+ ASSERT_EQ(compaction->input_levels(4)->num_files, 1);
+ ASSERT_EQ(compaction->input_levels(5)->num_files, 1);
+}
+
+TEST_F(CompactionPickerTest, UniversalSizeRatioTierCompactionLastLevel) {
+  // This test makes sure the size amp calculation skips the last level (L6),
+  // so size amp compaction is not triggered; instead, a size ratio compaction
+  // is triggered.
+ const uint64_t kFileSize = 100000;
+ const int kNumLevels = 7;
+ const int kLastLevel = kNumLevels - 1;
+ const int kPenultimateLevel = kLastLevel - 1;
+
+ ioptions_.compaction_style = kCompactionStyleUniversal;
+ ioptions_.preclude_last_level_data_seconds = 1000;
+ mutable_cf_options_.compaction_options_universal
+ .max_size_amplification_percent = 200;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(kNumLevels, kCompactionStyleUniversal);
+ Add(0, 100U, "100", "300", 1 * kFileSize);
+ Add(0, 101U, "200", "400", 1 * kFileSize);
+ Add(5, 90U, "100", "600", 4 * kFileSize);
+ Add(6, 80U, "200", "300", 2 * kFileSize);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+
+  // Internally, size amp compaction is evaluated before size ratio compaction.
+  // This checks that a size ratio compaction is picked instead of size amp
+ ASSERT_EQ(compaction->compaction_reason(),
+ CompactionReason::kUniversalSizeRatio);
+ ASSERT_EQ(compaction->output_level(), kPenultimateLevel - 1);
+ ASSERT_EQ(compaction->input_levels(0)->num_files, 2);
+ ASSERT_EQ(compaction->input_levels(5)->num_files, 0);
+ ASSERT_EQ(compaction->input_levels(6)->num_files, 0);
+}
+
+TEST_F(CompactionPickerTest, UniversalSizeAmpTierCompactionNotSuport) {
+  // Tiered compaction only supports more than 2 levels (otherwise the
+  // penultimate level is going to be level 0, which may make things more
+  // complicated), so when there are only 2 levels, level 1 is still treated
+  // as the last level for size amp compaction
+ const uint64_t kFileSize = 100000;
+ const int kNumLevels = 2;
+ const int kLastLevel = kNumLevels - 1;
+
+ ioptions_.compaction_style = kCompactionStyleUniversal;
+ ioptions_.preclude_last_level_data_seconds = 1000;
+ mutable_cf_options_.compaction_options_universal
+ .max_size_amplification_percent = 200;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(kNumLevels, kCompactionStyleUniversal);
+ Add(0, 100U, "100", "300", 1 * kFileSize);
+ Add(0, 101U, "200", "400", 1 * kFileSize);
+ Add(0, 90U, "100", "600", 4 * kFileSize);
+ Add(1, 80U, "200", "300", 2 * kFileSize);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+
+  // size amp compaction is still triggered even though preclude_last_level is set
+ ASSERT_EQ(compaction->compaction_reason(),
+ CompactionReason::kUniversalSizeAmplification);
+ ASSERT_EQ(compaction->output_level(), kLastLevel);
+ ASSERT_EQ(compaction->input_levels(0)->num_files, 3);
+ ASSERT_EQ(compaction->input_levels(1)->num_files, 1);
+}
+
+TEST_F(CompactionPickerTest, UniversalSizeAmpTierCompactionLastLevel) {
+  // This test makes sure the size amp compaction for tiered storage can still
+  // be triggered, but only for non-last-level files
+ const uint64_t kFileSize = 100000;
+ const int kNumLevels = 7;
+ const int kLastLevel = kNumLevels - 1;
+ const int kPenultimateLevel = kLastLevel - 1;
+
+ ioptions_.compaction_style = kCompactionStyleUniversal;
+ ioptions_.preclude_last_level_data_seconds = 1000;
+ mutable_cf_options_.compaction_options_universal
+ .max_size_amplification_percent = 200;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(kNumLevels, kCompactionStyleUniversal);
+ Add(0, 100U, "100", "300", 3 * kFileSize);
+ Add(0, 101U, "200", "400", 2 * kFileSize);
+ Add(5, 90U, "100", "600", 2 * kFileSize);
+ Add(6, 80U, "200", "300", 2 * kFileSize);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+
+ // It's a Size Amp compaction, but doesn't include the last level file and
+ // output to the penultimate level.
+ ASSERT_EQ(compaction->compaction_reason(),
+ CompactionReason::kUniversalSizeAmplification);
+ ASSERT_EQ(compaction->output_level(), kPenultimateLevel);
+ ASSERT_EQ(compaction->input_levels(0)->num_files, 2);
+ ASSERT_EQ(compaction->input_levels(5)->num_files, 1);
+ ASSERT_EQ(compaction->input_levels(6)->num_files, 0);
+}
+
+TEST_F(CompactionPickerU64TsTest, Overlap) {
+ int num_levels = ioptions_.num_levels;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+
+ constexpr int level = 0;
+ constexpr uint64_t file_number = 20ULL;
+ constexpr char smallest[4] = "500";
+ constexpr char largest[4] = "600";
+ constexpr uint64_t ts_of_smallest = 12345ULL;
+ constexpr uint64_t ts_of_largest = 56789ULL;
+
+ {
+ std::string ts1;
+ PutFixed64(&ts1, ts_of_smallest);
+ std::string ts2;
+ PutFixed64(&ts2, ts_of_largest);
+ Add(level, file_number, smallest, largest,
+ /*file_size=*/1U, /*path_id=*/0,
+ /*smallest_seq=*/100, /*largest_seq=*/100, /*compensated_file_size=*/0,
+ /*marked_for_compact=*/false, /*temperature=*/Temperature::kUnknown,
+ /*oldest_ancestor_time=*/kUnknownOldestAncesterTime, ts1, ts2);
+ UpdateVersionStorageInfo();
+ }
+
+ std::unordered_set<uint64_t> input{file_number};
+
+ std::vector<CompactionInputFiles> input_files;
+ ASSERT_OK(level_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input, vstorage_.get(), CompactionOptions()));
+ std::unique_ptr<Compaction> comp1(level_compaction_picker.CompactFiles(
+ CompactionOptions(), input_files, level, vstorage_.get(),
+ mutable_cf_options_, mutable_db_options_, /*output_path_id=*/0));
+
+ {
+ // [600, ts=50000] to [600, ts=50000] is the range to check.
+ // ucmp->Compare(smallest_user_key, c->GetLargestUserKey()) > 0, but
+ // ucmp->CompareWithoutTimestamp(smallest_user_key,
+ // c->GetLargestUserKey()) == 0.
+ // Should still be considered overlapping.
+ std::string user_key_with_ts1(largest);
+ PutFixed64(&user_key_with_ts1, ts_of_largest - 1);
+ std::string user_key_with_ts2(largest);
+ PutFixed64(&user_key_with_ts2, ts_of_largest - 1);
+ ASSERT_TRUE(level_compaction_picker.RangeOverlapWithCompaction(
+ user_key_with_ts1, user_key_with_ts2, level));
+ }
+ {
+ // [500, ts=60000] to [500, ts=60000] is the range to check.
+ // ucmp->Compare(largest_user_key, c->GetSmallestUserKey()) < 0, but
+ // ucmp->CompareWithoutTimestamp(largest_user_key,
+ // c->GetSmallestUserKey()) == 0.
+ // Should still be considered overlapping.
+ std::string user_key_with_ts1(smallest);
+ PutFixed64(&user_key_with_ts1, ts_of_smallest + 1);
+ std::string user_key_with_ts2(smallest);
+ PutFixed64(&user_key_with_ts2, ts_of_smallest + 1);
+ ASSERT_TRUE(level_compaction_picker.RangeOverlapWithCompaction(
+ user_key_with_ts1, user_key_with_ts2, level));
+ }
+}
+
+TEST_F(CompactionPickerU64TsTest, CannotTrivialMoveUniversal) {
+ constexpr uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.compaction_options_universal.allow_trivial_move = true;
+ NewVersionStorage(1, kCompactionStyleUniversal);
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+ UpdateVersionStorageInfo();
+  // must return false when there are no files.
+ ASSERT_FALSE(universal_compaction_picker.NeedsCompaction(vstorage_.get()));
+
+ std::string ts1;
+ PutFixed64(&ts1, 9000);
+ std::string ts2;
+ PutFixed64(&ts2, 8000);
+ std::string ts3;
+ PutFixed64(&ts3, 7000);
+ std::string ts4;
+ PutFixed64(&ts4, 6000);
+
+ NewVersionStorage(3, kCompactionStyleUniversal);
+ // A compaction should be triggered and pick file 2
+ Add(1, 1U, "150", "150", kFileSize, /*path_id=*/0, /*smallest_seq=*/100,
+ /*largest_seq=*/100, /*compensated_file_size=*/kFileSize,
+ /*marked_for_compact=*/false, Temperature::kUnknown,
+ kUnknownOldestAncesterTime, ts1, ts2);
+ Add(2, 2U, "150", "150", kFileSize, /*path_id=*/0, /*smallest_seq=*/100,
+ /*largest_seq=*/100, /*compensated_file_size=*/kFileSize,
+ /*marked_for_compact=*/false, Temperature::kUnknown,
+ kUnknownOldestAncesterTime, ts3, ts4);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ assert(compaction);
+ ASSERT_TRUE(!compaction->is_trivial_move());
+}
+
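+// Parameterized on whether per-key placement is enabled. Each test injects the
+// flag via the "Compaction::SupportsPerKeyPlacement:Enabled" sync point to
+// control whether compactions are treated as supporting per-key placement
+// (output to both the penultimate and the last level).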
+class PerKeyPlacementCompactionPickerTest
+ : public CompactionPickerTest,
+ public testing::WithParamInterface<bool> {
+ public:
+ PerKeyPlacementCompactionPickerTest() : CompactionPickerTest() {}
+
+ void SetUp() override { enable_per_key_placement_ = GetParam(); }
+
+ protected:
+ bool enable_per_key_placement_ = false;
+};
+
+TEST_P(PerKeyPlacementCompactionPickerTest, OverlapWithNormalCompaction) {
+ SyncPoint::GetInstance()->SetCallBack(
+ "Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) {
+ auto supports_per_key_placement = static_cast<bool*>(arg);
+ *supports_per_key_placement = enable_per_key_placement_;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ int num_levels = ioptions_.num_levels;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+
+ Add(0, 21U, "100", "150", 60000000U);
+ Add(0, 22U, "300", "350", 60000000U);
+ Add(5, 40U, "200", "250", 60000000U);
+ Add(6, 50U, "101", "351", 60000000U);
+ UpdateVersionStorageInfo();
+
+ CompactionOptions comp_options;
+ std::unordered_set<uint64_t> input_set;
+ input_set.insert(40);
+ std::vector<CompactionInputFiles> input_files;
+ ASSERT_OK(level_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ std::unique_ptr<Compaction> comp1(level_compaction_picker.CompactFiles(
+ comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_,
+ mutable_db_options_, 0));
+
+ input_set.clear();
+ input_files.clear();
+ input_set.insert(21);
+ input_set.insert(22);
+ input_set.insert(50);
+ ASSERT_OK(level_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ ASSERT_EQ(enable_per_key_placement_,
+ level_compaction_picker.FilesRangeOverlapWithCompaction(
+ input_files, 6,
+ Compaction::EvaluatePenultimateLevel(vstorage_.get(), ioptions_,
+ 0, 6)));
+}
+
+TEST_P(PerKeyPlacementCompactionPickerTest, NormalCompactionOverlap) {
+ SyncPoint::GetInstance()->SetCallBack(
+ "Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) {
+ auto supports_per_key_placement = static_cast<bool*>(arg);
+ *supports_per_key_placement = enable_per_key_placement_;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ int num_levels = ioptions_.num_levels;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+
+ Add(0, 21U, "100", "150", 60000000U);
+ Add(0, 22U, "300", "350", 60000000U);
+ Add(4, 40U, "200", "220", 60000000U);
+ Add(4, 41U, "230", "250", 60000000U);
+ Add(6, 50U, "101", "351", 60000000U);
+ UpdateVersionStorageInfo();
+
+ CompactionOptions comp_options;
+ std::unordered_set<uint64_t> input_set;
+ input_set.insert(21);
+ input_set.insert(22);
+ input_set.insert(50);
+ std::vector<CompactionInputFiles> input_files;
+ ASSERT_OK(level_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ std::unique_ptr<Compaction> comp1(level_compaction_picker.CompactFiles(
+ comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
+ mutable_db_options_, 0));
+
+ input_set.clear();
+ input_files.clear();
+ input_set.insert(40);
+ input_set.insert(41);
+ ASSERT_OK(level_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ ASSERT_EQ(enable_per_key_placement_,
+ level_compaction_picker.FilesRangeOverlapWithCompaction(
+ input_files, 5, Compaction::kInvalidLevel));
+}
+
+TEST_P(PerKeyPlacementCompactionPickerTest,
+ OverlapWithNormalCompactionUniveral) {
+ SyncPoint::GetInstance()->SetCallBack(
+ "Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) {
+ auto supports_per_key_placement = static_cast<bool*>(arg);
+ *supports_per_key_placement = enable_per_key_placement_;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ int num_levels = ioptions_.num_levels;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+ NewVersionStorage(num_levels, kCompactionStyleUniversal);
+
+ Add(0, 21U, "100", "150", 60000000U);
+ Add(0, 22U, "300", "350", 60000000U);
+ Add(5, 40U, "200", "250", 60000000U);
+ Add(6, 50U, "101", "351", 60000000U);
+ UpdateVersionStorageInfo();
+
+ CompactionOptions comp_options;
+ std::unordered_set<uint64_t> input_set;
+ input_set.insert(40);
+ std::vector<CompactionInputFiles> input_files;
+ ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles(
+ comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_,
+ mutable_db_options_, 0));
+
+ input_set.clear();
+ input_files.clear();
+ input_set.insert(21);
+ input_set.insert(22);
+ input_set.insert(50);
+ ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ ASSERT_EQ(enable_per_key_placement_,
+ universal_compaction_picker.FilesRangeOverlapWithCompaction(
+ input_files, 6,
+ Compaction::EvaluatePenultimateLevel(vstorage_.get(), ioptions_,
+ 0, 6)));
+}
+
+TEST_P(PerKeyPlacementCompactionPickerTest, NormalCompactionOverlapUniversal) {
+ SyncPoint::GetInstance()->SetCallBack(
+ "Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) {
+ auto supports_per_key_placement = static_cast<bool*>(arg);
+ *supports_per_key_placement = enable_per_key_placement_;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ int num_levels = ioptions_.num_levels;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+ NewVersionStorage(num_levels, kCompactionStyleUniversal);
+
+ Add(0, 21U, "100", "150", 60000000U);
+ Add(0, 22U, "300", "350", 60000000U);
+ Add(4, 40U, "200", "220", 60000000U);
+ Add(4, 41U, "230", "250", 60000000U);
+ Add(6, 50U, "101", "351", 60000000U);
+ UpdateVersionStorageInfo();
+
+ CompactionOptions comp_options;
+ std::unordered_set<uint64_t> input_set;
+ input_set.insert(21);
+ input_set.insert(22);
+ input_set.insert(50);
+ std::vector<CompactionInputFiles> input_files;
+ ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles(
+ comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
+ mutable_db_options_, 0));
+
+ input_set.clear();
+ input_files.clear();
+ input_set.insert(40);
+ input_set.insert(41);
+ ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ ASSERT_EQ(enable_per_key_placement_,
+ universal_compaction_picker.FilesRangeOverlapWithCompaction(
+ input_files, 5, Compaction::kInvalidLevel));
+}
+
+TEST_P(PerKeyPlacementCompactionPickerTest, PenultimateOverlapUniversal) {
+  // This test makes sure that tiered compaction locks the whole range of
+  // both the output level and the penultimate level
+ if (enable_per_key_placement_) {
+ ioptions_.preclude_last_level_data_seconds = 10000;
+ }
+
+ int num_levels = ioptions_.num_levels;
+ ioptions_.compaction_style = kCompactionStyleUniversal;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+ NewVersionStorage(num_levels, kCompactionStyleUniversal);
+
+ // L4: [200, 220] [230, 250] [360, 380]
+ // L5:
+ // L6: [101, 351]
+ Add(4, 40U, "200", "220", 60000000U);
+ Add(4, 41U, "230", "250", 60000000U);
+ Add(4, 42U, "360", "380", 60000000U);
+ Add(6, 60U, "101", "351", 60000000U);
+ UpdateVersionStorageInfo();
+
+  // The existing compaction is the 1st L4 file + the L6 file; a compaction of
+  // the 2nd L4 file to L5 (the penultimate level) then overlaps with it when
+  // the tiered compaction feature is on.
+ CompactionOptions comp_options;
+ std::unordered_set<uint64_t> input_set;
+ input_set.insert(40);
+ input_set.insert(60);
+ std::vector<CompactionInputFiles> input_files;
+ ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles(
+ comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
+ mutable_db_options_, 0));
+
+ input_set.clear();
+ input_files.clear();
+ input_set.insert(41);
+ ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ ASSERT_EQ(enable_per_key_placement_,
+ universal_compaction_picker.FilesRangeOverlapWithCompaction(
+ input_files, 5, Compaction::kInvalidLevel));
+
+ // compacting the 3rd L4 file is always safe:
+ input_set.clear();
+ input_files.clear();
+ input_set.insert(42);
+ ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ ASSERT_FALSE(universal_compaction_picker.FilesRangeOverlapWithCompaction(
+ input_files, 5, Compaction::kInvalidLevel));
+}
+
+TEST_P(PerKeyPlacementCompactionPickerTest, LastLevelOnlyOverlapUniversal) {
+ if (enable_per_key_placement_) {
+ ioptions_.preclude_last_level_data_seconds = 10000;
+ }
+
+ int num_levels = ioptions_.num_levels;
+ ioptions_.compaction_style = kCompactionStyleUniversal;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+ NewVersionStorage(num_levels, kCompactionStyleUniversal);
+
+ // L4: [200, 220] [230, 250] [360, 380]
+ // L5:
+ // L6: [101, 351]
+ Add(4, 40U, "200", "220", 60000000U);
+ Add(4, 41U, "230", "250", 60000000U);
+ Add(4, 42U, "360", "380", 60000000U);
+ Add(6, 60U, "101", "351", 60000000U);
+ UpdateVersionStorageInfo();
+
+ CompactionOptions comp_options;
+ std::unordered_set<uint64_t> input_set;
+ input_set.insert(60);
+ std::vector<CompactionInputFiles> input_files;
+ ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles(
+ comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
+ mutable_db_options_, 0));
+
+  // File 41 cannot be compacted if the preclude_last_level feature is on;
+  // otherwise, compacting file 41 is okay.
+ input_set.clear();
+ input_files.clear();
+ input_set.insert(41);
+ ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ ASSERT_EQ(enable_per_key_placement_,
+ universal_compaction_picker.FilesRangeOverlapWithCompaction(
+ input_files, 5, Compaction::kInvalidLevel));
+
+ // compacting the 3rd L4 file is always safe:
+ input_set.clear();
+ input_files.clear();
+ input_set.insert(42);
+ ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ ASSERT_FALSE(universal_compaction_picker.FilesRangeOverlapWithCompaction(
+ input_files, 5, Compaction::kInvalidLevel));
+}
+
+TEST_P(PerKeyPlacementCompactionPickerTest,
+ LastLevelOnlyFailPenultimateUniversal) {
+  // This tests that a last-level-only compaction is still unable to output to
+  // the penultimate level if there's already a file in the penultimate level.
+  // This should rarely happen in universal compaction, as the non-empty L5
+  // should be included in the compaction.
+ if (enable_per_key_placement_) {
+ ioptions_.preclude_last_level_data_seconds = 10000;
+ }
+
+ int num_levels = ioptions_.num_levels;
+ ioptions_.compaction_style = kCompactionStyleUniversal;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+ NewVersionStorage(num_levels, kCompactionStyleUniversal);
+
+ // L4: [200, 220]
+ // L5: [230, 250]
+ // L6: [101, 351]
+ Add(4, 40U, "200", "220", 60000000U);
+ Add(5, 50U, "230", "250", 60000000U);
+ Add(6, 60U, "101", "351", 60000000U);
+ UpdateVersionStorageInfo();
+
+ CompactionOptions comp_options;
+ std::unordered_set<uint64_t> input_set;
+ input_set.insert(60);
+ std::vector<CompactionInputFiles> input_files;
+ ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles(
+ comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
+ mutable_db_options_, 0));
+
+ ASSERT_TRUE(comp1);
+ ASSERT_EQ(comp1->GetPenultimateLevel(), Compaction::kInvalidLevel);
+
+ // As comp1 cannot be output to the penultimate level, compacting file 40 to
+ // L5 is always safe.
+ input_set.clear();
+ input_files.clear();
+ input_set.insert(40);
+ ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ ASSERT_FALSE(universal_compaction_picker.FilesRangeOverlapWithCompaction(
+ input_files, 5, Compaction::kInvalidLevel));
+
+ std::unique_ptr<Compaction> comp2(universal_compaction_picker.CompactFiles(
+ comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_,
+ mutable_db_options_, 0));
+ ASSERT_TRUE(comp2);
+ ASSERT_EQ(Compaction::kInvalidLevel, comp2->GetPenultimateLevel());
+}
+
+TEST_P(PerKeyPlacementCompactionPickerTest,
+ LastLevelOnlyConflictWithOngoingUniversal) {
+  // This tests that a last-level-only compaction is still unable to output to
+  // the penultimate level if there's already an ongoing compaction to the
+  // penultimate level
+ if (enable_per_key_placement_) {
+ ioptions_.preclude_last_level_data_seconds = 10000;
+ }
+
+ int num_levels = ioptions_.num_levels;
+ ioptions_.compaction_style = kCompactionStyleUniversal;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+ NewVersionStorage(num_levels, kCompactionStyleUniversal);
+
+ // L4: [200, 220] [230, 250] [360, 380]
+ // L5:
+ // L6: [101, 351]
+ Add(4, 40U, "200", "220", 60000000U);
+ Add(4, 41U, "230", "250", 60000000U);
+ Add(4, 42U, "360", "380", 60000000U);
+ Add(6, 60U, "101", "351", 60000000U);
+ UpdateVersionStorageInfo();
+
+ // create an ongoing compaction to L5 (penultimate level)
+ CompactionOptions comp_options;
+ std::unordered_set<uint64_t> input_set;
+ input_set.insert(40);
+ std::vector<CompactionInputFiles> input_files;
+ ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles(
+ comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_,
+ mutable_db_options_, 0));
+
+ ASSERT_TRUE(comp1);
+ ASSERT_EQ(comp1->GetPenultimateLevel(), Compaction::kInvalidLevel);
+
+ input_set.clear();
+ input_files.clear();
+ input_set.insert(60);
+ ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ ASSERT_EQ(enable_per_key_placement_,
+ universal_compaction_picker.FilesRangeOverlapWithCompaction(
+ input_files, 6,
+ Compaction::EvaluatePenultimateLevel(vstorage_.get(), ioptions_,
+ 6, 6)));
+
+ if (!enable_per_key_placement_) {
+ std::unique_ptr<Compaction> comp2(universal_compaction_picker.CompactFiles(
+ comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
+ mutable_db_options_, 0));
+ ASSERT_TRUE(comp2);
+ ASSERT_EQ(Compaction::kInvalidLevel, comp2->GetPenultimateLevel());
+ }
+}
+
+TEST_P(PerKeyPlacementCompactionPickerTest,
+ LastLevelOnlyNoConflictWithOngoingUniversal) {
+  // This is similar to `LastLevelOnlyConflictWithOngoingUniversal`; the only
+  // change is that the ongoing compaction to L5 has no overlap with the last
+  // level compaction, so it's safe to move data from the last level to the
+  // penultimate level.
+ if (enable_per_key_placement_) {
+ ioptions_.preclude_last_level_data_seconds = 10000;
+ }
+
+ int num_levels = ioptions_.num_levels;
+ ioptions_.compaction_style = kCompactionStyleUniversal;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+ NewVersionStorage(num_levels, kCompactionStyleUniversal);
+
+ // L4: [200, 220] [230, 250] [360, 380]
+ // L5:
+ // L6: [101, 351]
+ Add(4, 40U, "200", "220", 60000000U);
+ Add(4, 41U, "230", "250", 60000000U);
+ Add(4, 42U, "360", "380", 60000000U);
+ Add(6, 60U, "101", "351", 60000000U);
+ UpdateVersionStorageInfo();
+
+ // create an ongoing compaction to L5 (penultimate level)
+ CompactionOptions comp_options;
+ std::unordered_set<uint64_t> input_set;
+ input_set.insert(42);
+ std::vector<CompactionInputFiles> input_files;
+ ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles(
+ comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_,
+ mutable_db_options_, 0));
+
+ ASSERT_TRUE(comp1);
+ ASSERT_EQ(comp1->GetPenultimateLevel(), Compaction::kInvalidLevel);
+
+ input_set.clear();
+ input_files.clear();
+ input_set.insert(60);
+ ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ // always safe to move data up
+ ASSERT_FALSE(universal_compaction_picker.FilesRangeOverlapWithCompaction(
+ input_files, 6,
+ Compaction::EvaluatePenultimateLevel(vstorage_.get(), ioptions_, 6, 6)));
+
+ // 2 compactions can be run in parallel
+ std::unique_ptr<Compaction> comp2(universal_compaction_picker.CompactFiles(
+ comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
+ mutable_db_options_, 0));
+ ASSERT_TRUE(comp2);
+ if (enable_per_key_placement_) {
+ ASSERT_NE(Compaction::kInvalidLevel, comp2->GetPenultimateLevel());
+ } else {
+ ASSERT_EQ(Compaction::kInvalidLevel, comp2->GetPenultimateLevel());
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(PerKeyPlacementCompactionPickerTest,
+ PerKeyPlacementCompactionPickerTest, ::testing::Bool());
+
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/compaction/compaction_picker_universal.cc b/src/rocksdb/db/compaction/compaction_picker_universal.cc
new file mode 100644
index 000000000..376e4f60f
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_universal.cc
@@ -0,0 +1,1450 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction_picker_universal.h"
+#ifndef ROCKSDB_LITE
+
+#include <cinttypes>
+#include <limits>
+#include <queue>
+#include <string>
+#include <utility>
+
+#include "db/column_family.h"
+#include "file/filename.h"
+#include "logging/log_buffer.h"
+#include "logging/logging.h"
+#include "monitoring/statistics.h"
+#include "test_util/sync_point.h"
+#include "util/random.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+// A helper class that forms universal compactions. The class is used by
+// UniversalCompactionPicker::PickCompaction().
+// The usage is to create an instance of the class and get the compaction
+// object by calling PickCompaction().
+class UniversalCompactionBuilder {
+ public:
+ UniversalCompactionBuilder(
+ const ImmutableOptions& ioptions, const InternalKeyComparator* icmp,
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+ UniversalCompactionPicker* picker, LogBuffer* log_buffer)
+ : ioptions_(ioptions),
+ icmp_(icmp),
+ cf_name_(cf_name),
+ mutable_cf_options_(mutable_cf_options),
+ mutable_db_options_(mutable_db_options),
+ vstorage_(vstorage),
+ picker_(picker),
+ log_buffer_(log_buffer) {}
+
+  // Form and return the compaction object. The caller owns the returned
+  // object.
+ Compaction* PickCompaction();
+
+ private:
+ struct SortedRun {
+ SortedRun(int _level, FileMetaData* _file, uint64_t _size,
+ uint64_t _compensated_file_size, bool _being_compacted)
+ : level(_level),
+ file(_file),
+ size(_size),
+ compensated_file_size(_compensated_file_size),
+ being_compacted(_being_compacted) {
+ assert(compensated_file_size > 0);
+ assert(level != 0 || file != nullptr);
+ }
+
+ void Dump(char* out_buf, size_t out_buf_size,
+ bool print_path = false) const;
+
+ // sorted_run_count is added into the string to print
+ void DumpSizeInfo(char* out_buf, size_t out_buf_size,
+ size_t sorted_run_count) const;
+
+ int level;
+    // `file` will be null for level > 0. For level = 0, the sorted run is
+ // for this file.
+ FileMetaData* file;
+    // For level > 0, `size` and `compensated_file_size` are the sums of the
+    // sizes of all files in the level. `being_compacted` should be the same
+    // for all files in a non-zero level. Use the value here.
+ uint64_t size;
+ uint64_t compensated_file_size;
+ bool being_compacted;
+ };
+
+ // Pick Universal compaction to limit read amplification
+ Compaction* PickCompactionToReduceSortedRuns(
+ unsigned int ratio, unsigned int max_number_of_files_to_compact);
+
+ // Pick Universal compaction to limit space amplification.
+ Compaction* PickCompactionToReduceSizeAmp();
+
+ // Try to pick incremental compaction to reduce space amplification.
+ // It will return null if it cannot find a fanout within the threshold.
+ // Fanout is defined as
+ // total size of files to compact at output level
+ // --------------------------------------------------
+ // total size of files to compact at other levels
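+  // For example, if the files to compact at the output level total 80 MB and
+  // the files to compact at the other levels total 20 MB, the fanout is 4.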
+ Compaction* PickIncrementalForReduceSizeAmp(double fanout_threshold);
+
+ Compaction* PickDeleteTriggeredCompaction();
+
+ // Form a compaction from the sorted run indicated by start_index to the
+ // oldest sorted run.
+ // The caller is responsible for making sure that those files are not in
+ // compaction.
+ Compaction* PickCompactionToOldest(size_t start_index,
+ CompactionReason compaction_reason);
+
+ Compaction* PickCompactionWithSortedRunRange(
+ size_t start_index, size_t end_index, CompactionReason compaction_reason);
+
+ // Try to pick periodic compaction. The caller should only call it
+ // if there is at least one file marked for periodic compaction.
+  // null will be returned if no such compaction can be formed
+ // because some files are being compacted.
+ Compaction* PickPeriodicCompaction();
+
+ // Used in universal compaction when the allow_trivial_move
+ // option is set. Checks whether there are any overlapping files
+ // in the input. Returns true if the input files are non
+ // overlapping.
+ bool IsInputFilesNonOverlapping(Compaction* c);
+
+ uint64_t GetMaxOverlappingBytes() const;
+
+ const ImmutableOptions& ioptions_;
+ const InternalKeyComparator* icmp_;
+ double score_;
+ std::vector<SortedRun> sorted_runs_;
+ const std::string& cf_name_;
+ const MutableCFOptions& mutable_cf_options_;
+ const MutableDBOptions& mutable_db_options_;
+ VersionStorageInfo* vstorage_;
+ UniversalCompactionPicker* picker_;
+ LogBuffer* log_buffer_;
+
+ static std::vector<SortedRun> CalculateSortedRuns(
+ const VersionStorageInfo& vstorage);
+
+ // Pick a path ID to place a newly generated file, with its estimated file
+ // size.
+ static uint32_t GetPathId(const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options,
+ uint64_t file_size);
+};
+
+// Used in universal compaction when trivial move is enabled.
+// This structure is used for the construction of a min heap
+// that contains the file metadata, the level of the file,
+// and the index of the file in that level
+
+struct InputFileInfo {
+ InputFileInfo() : f(nullptr), level(0), index(0) {}
+
+ FileMetaData* f;
+ size_t level;
+ size_t index;
+};
+
+// Used in universal compaction when trivial move is enabled.
+// This comparator is used for the construction of a min heap
+// based on the smallest key of the file.
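+// Note: operator() returns true when i1's smallest key compares greater than
+// i2's, which makes std::priority_queue behave as a min heap on smallest keys.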
+struct SmallestKeyHeapComparator {
+ explicit SmallestKeyHeapComparator(const Comparator* ucmp) { ucmp_ = ucmp; }
+
+ bool operator()(InputFileInfo i1, InputFileInfo i2) const {
+ return (ucmp_->CompareWithoutTimestamp(i1.f->smallest.user_key(),
+ i2.f->smallest.user_key()) > 0);
+ }
+
+ private:
+ const Comparator* ucmp_;
+};
+
+using SmallestKeyHeap =
+ std::priority_queue<InputFileInfo, std::vector<InputFileInfo>,
+ SmallestKeyHeapComparator>;
+
+// This function creates the heap that is used to check whether the files
+// overlap during universal compaction when allow_trivial_move is set.
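+// When L0 is an input level, each of its files is pushed individually since
+// each L0 file is its own sorted run; for other levels only the first file is
+// pushed here, and its successors are added lazily as the heap is drained.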
+SmallestKeyHeap create_level_heap(Compaction* c, const Comparator* ucmp) {
+ SmallestKeyHeap smallest_key_priority_q =
+ SmallestKeyHeap(SmallestKeyHeapComparator(ucmp));
+
+ InputFileInfo input_file;
+
+ for (size_t l = 0; l < c->num_input_levels(); l++) {
+ if (c->num_input_files(l) != 0) {
+ if (l == 0 && c->start_level() == 0) {
+ for (size_t i = 0; i < c->num_input_files(0); i++) {
+ input_file.f = c->input(0, i);
+ input_file.level = 0;
+ input_file.index = i;
+ smallest_key_priority_q.push(std::move(input_file));
+ }
+ } else {
+ input_file.f = c->input(l, 0);
+ input_file.level = l;
+ input_file.index = 0;
+ smallest_key_priority_q.push(std::move(input_file));
+ }
+ }
+ }
+ return smallest_key_priority_q;
+}
+
+#ifndef NDEBUG
+// smallest_seqno and largest_seqno are set iff `files` is not empty.
+void GetSmallestLargestSeqno(const std::vector<FileMetaData*>& files,
+ SequenceNumber* smallest_seqno,
+ SequenceNumber* largest_seqno) {
+ bool is_first = true;
+ for (FileMetaData* f : files) {
+ assert(f->fd.smallest_seqno <= f->fd.largest_seqno);
+ if (is_first) {
+ is_first = false;
+ *smallest_seqno = f->fd.smallest_seqno;
+ *largest_seqno = f->fd.largest_seqno;
+ } else {
+ if (f->fd.smallest_seqno < *smallest_seqno) {
+ *smallest_seqno = f->fd.smallest_seqno;
+ }
+ if (f->fd.largest_seqno > *largest_seqno) {
+ *largest_seqno = f->fd.largest_seqno;
+ }
+ }
+ }
+}
+#endif
+} // namespace
+
+// Algorithm that checks to see if there are any overlapping
+// files in the input
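+// Files are visited in ascending order of smallest user key via the min heap;
+// the inputs are non-overlapping iff each file's smallest key is strictly
+// greater than the previously visited file's largest key.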
+bool UniversalCompactionBuilder::IsInputFilesNonOverlapping(Compaction* c) {
+ auto comparator = icmp_->user_comparator();
+ int first_iter = 1;
+
+ InputFileInfo prev, curr, next;
+
+ SmallestKeyHeap smallest_key_priority_q =
+ create_level_heap(c, icmp_->user_comparator());
+
+ while (!smallest_key_priority_q.empty()) {
+ curr = smallest_key_priority_q.top();
+ smallest_key_priority_q.pop();
+
+ if (first_iter) {
+ prev = curr;
+ first_iter = 0;
+ } else {
+ if (comparator->CompareWithoutTimestamp(
+ prev.f->largest.user_key(), curr.f->smallest.user_key()) >= 0) {
+ // found overlapping files, return false
+ return false;
+ }
+ assert(comparator->CompareWithoutTimestamp(
+ curr.f->largest.user_key(), prev.f->largest.user_key()) > 0);
+ prev = curr;
+ }
+
+ next.f = nullptr;
+
+ if (c->level(curr.level) != 0 &&
+ curr.index < c->num_input_files(curr.level) - 1) {
+ next.f = c->input(curr.level, curr.index + 1);
+ next.level = curr.level;
+ next.index = curr.index + 1;
+ }
+
+ if (next.f) {
+ smallest_key_priority_q.push(std::move(next));
+ }
+ }
+ return true;
+}
+
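+// A compaction is needed if the L0 compaction score has reached 1, or if any
+// files are marked for compaction or for periodic compaction.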
+bool UniversalCompactionPicker::NeedsCompaction(
+ const VersionStorageInfo* vstorage) const {
+ const int kLevel0 = 0;
+ if (vstorage->CompactionScore(kLevel0) >= 1) {
+ return true;
+ }
+ if (!vstorage->FilesMarkedForPeriodicCompaction().empty()) {
+ return true;
+ }
+ if (!vstorage->FilesMarkedForCompaction().empty()) {
+ return true;
+ }
+ return false;
+}
+
+Compaction* UniversalCompactionPicker::PickCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+ LogBuffer* log_buffer, SequenceNumber /* earliest_memtable_seqno */) {
+ UniversalCompactionBuilder builder(ioptions_, icmp_, cf_name,
+ mutable_cf_options, mutable_db_options,
+ vstorage, this, log_buffer);
+ return builder.PickCompaction();
+}
+
+void UniversalCompactionBuilder::SortedRun::Dump(char* out_buf,
+ size_t out_buf_size,
+ bool print_path) const {
+ if (level == 0) {
+ assert(file != nullptr);
+ if (file->fd.GetPathId() == 0 || !print_path) {
+ snprintf(out_buf, out_buf_size, "file %" PRIu64, file->fd.GetNumber());
+ } else {
+ snprintf(out_buf, out_buf_size,
+ "file %" PRIu64
+ "(path "
+ "%" PRIu32 ")",
+ file->fd.GetNumber(), file->fd.GetPathId());
+ }
+ } else {
+ snprintf(out_buf, out_buf_size, "level %d", level);
+ }
+}
+
+void UniversalCompactionBuilder::SortedRun::DumpSizeInfo(
+ char* out_buf, size_t out_buf_size, size_t sorted_run_count) const {
+ if (level == 0) {
+ assert(file != nullptr);
+ snprintf(out_buf, out_buf_size,
+ "file %" PRIu64 "[%" ROCKSDB_PRIszt
+ "] "
+ "with size %" PRIu64 " (compensated size %" PRIu64 ")",
+ file->fd.GetNumber(), sorted_run_count, file->fd.GetFileSize(),
+ file->compensated_file_size);
+ } else {
+ snprintf(out_buf, out_buf_size,
+ "level %d[%" ROCKSDB_PRIszt
+ "] "
+ "with size %" PRIu64 " (compensated size %" PRIu64 ")",
+ level, sorted_run_count, size, compensated_file_size);
+ }
+}
+
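+// Each L0 file forms its own sorted run, while every non-empty level greater
+// than 0 is treated as a single sorted run covering all of its files.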
+std::vector<UniversalCompactionBuilder::SortedRun>
+UniversalCompactionBuilder::CalculateSortedRuns(
+ const VersionStorageInfo& vstorage) {
+ std::vector<UniversalCompactionBuilder::SortedRun> ret;
+ for (FileMetaData* f : vstorage.LevelFiles(0)) {
+ ret.emplace_back(0, f, f->fd.GetFileSize(), f->compensated_file_size,
+ f->being_compacted);
+ }
+ for (int level = 1; level < vstorage.num_levels(); level++) {
+ uint64_t total_compensated_size = 0U;
+ uint64_t total_size = 0U;
+ bool being_compacted = false;
+ for (FileMetaData* f : vstorage.LevelFiles(level)) {
+ total_compensated_size += f->compensated_file_size;
+ total_size += f->fd.GetFileSize();
+ // Size amp, read amp and periodic compactions always include all files
+ // for a non-zero level. However, a delete triggered compaction and
+ // a trivial move might pick a subset of files in a sorted run. So
+ // always check all files in a sorted run and mark the entire run as
+ // being compacted if one or more files are being compacted
+ if (f->being_compacted) {
+ being_compacted = f->being_compacted;
+ }
+ }
+ if (total_compensated_size > 0) {
+ ret.emplace_back(level, nullptr, total_size, total_compensated_size,
+ being_compacted);
+ }
+ }
+ return ret;
+}
+
+// Universal style of compaction. Pick files that are contiguous in
+// time-range to compact.
+Compaction* UniversalCompactionBuilder::PickCompaction() {
+ const int kLevel0 = 0;
+ score_ = vstorage_->CompactionScore(kLevel0);
+ sorted_runs_ = CalculateSortedRuns(*vstorage_);
+
+ if (sorted_runs_.size() == 0 ||
+ (vstorage_->FilesMarkedForPeriodicCompaction().empty() &&
+ vstorage_->FilesMarkedForCompaction().empty() &&
+ sorted_runs_.size() < (unsigned int)mutable_cf_options_
+ .level0_file_num_compaction_trigger)) {
+ ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: nothing to do\n",
+ cf_name_.c_str());
+ TEST_SYNC_POINT_CALLBACK(
+ "UniversalCompactionBuilder::PickCompaction:Return", nullptr);
+ return nullptr;
+ }
+ VersionStorageInfo::LevelSummaryStorage tmp;
+ ROCKS_LOG_BUFFER_MAX_SZ(
+ log_buffer_, 3072,
+ "[%s] Universal: sorted runs: %" ROCKSDB_PRIszt " files: %s\n",
+ cf_name_.c_str(), sorted_runs_.size(), vstorage_->LevelSummary(&tmp));
+
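+  // Candidates are tried in priority order: periodic compaction, then size
+  // amplification, then size ratio, then sorted-run count, and finally delete
+  // triggered compaction.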
+ Compaction* c = nullptr;
+  // Periodic compaction has higher priority than other types of compaction
+ // because it's a hard requirement.
+ if (!vstorage_->FilesMarkedForPeriodicCompaction().empty()) {
+ // Always need to do a full compaction for periodic compaction.
+ c = PickPeriodicCompaction();
+ }
+
+ // Check for size amplification.
+ if (c == nullptr &&
+ sorted_runs_.size() >=
+ static_cast<size_t>(
+ mutable_cf_options_.level0_file_num_compaction_trigger)) {
+ if ((c = PickCompactionToReduceSizeAmp()) != nullptr) {
+ ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: compacting for size amp\n",
+ cf_name_.c_str());
+ } else {
+ // Size amplification is within limits. Try reducing read
+ // amplification while maintaining file size ratios.
+ unsigned int ratio =
+ mutable_cf_options_.compaction_options_universal.size_ratio;
+
+ if ((c = PickCompactionToReduceSortedRuns(ratio, UINT_MAX)) != nullptr) {
+ ROCKS_LOG_BUFFER(log_buffer_,
+ "[%s] Universal: compacting for size ratio\n",
+ cf_name_.c_str());
+ } else {
+ // Size amplification and file size ratios are within configured limits.
+        // If max read amplification exceeds the configured limits, then force
+        // compaction without looking at file size ratios and try to reduce
+        // the number of files to below level0_file_num_compaction_trigger.
+        // This is guaranteed by NeedsCompaction()
+ assert(sorted_runs_.size() >=
+ static_cast<size_t>(
+ mutable_cf_options_.level0_file_num_compaction_trigger));
+ // Get the total number of sorted runs that are not being compacted
+ int num_sr_not_compacted = 0;
+ for (size_t i = 0; i < sorted_runs_.size(); i++) {
+ if (sorted_runs_[i].being_compacted == false) {
+ num_sr_not_compacted++;
+ }
+ }
+
+ // The number of sorted runs that are not being compacted is greater
+ // than the maximum allowed number of sorted runs
+ if (num_sr_not_compacted >
+ mutable_cf_options_.level0_file_num_compaction_trigger) {
+ unsigned int num_files =
+ num_sr_not_compacted -
+ mutable_cf_options_.level0_file_num_compaction_trigger + 1;
+ if ((c = PickCompactionToReduceSortedRuns(UINT_MAX, num_files)) !=
+ nullptr) {
+ ROCKS_LOG_BUFFER(log_buffer_,
+ "[%s] Universal: compacting for file num -- %u\n",
+ cf_name_.c_str(), num_files);
+ }
+ }
+ }
+ }
+ }
+
+ if (c == nullptr) {
+ if ((c = PickDeleteTriggeredCompaction()) != nullptr) {
+ ROCKS_LOG_BUFFER(log_buffer_,
+ "[%s] Universal: delete triggered compaction\n",
+ cf_name_.c_str());
+ }
+ }
+
+ if (c == nullptr) {
+ TEST_SYNC_POINT_CALLBACK(
+ "UniversalCompactionBuilder::PickCompaction:Return", nullptr);
+ return nullptr;
+ }
+
+ if (mutable_cf_options_.compaction_options_universal.allow_trivial_move ==
+ true &&
+ c->compaction_reason() != CompactionReason::kPeriodicCompaction) {
+ c->set_is_trivial_move(IsInputFilesNonOverlapping(c));
+ }
+
+// validate that all the chosen files of L0 are non overlapping in time
+#ifndef NDEBUG
+ bool is_first = true;
+
+ size_t level_index = 0U;
+ if (c->start_level() == 0) {
+ for (auto f : *c->inputs(0)) {
+ assert(f->fd.smallest_seqno <= f->fd.largest_seqno);
+ if (is_first) {
+ is_first = false;
+ }
+ }
+ level_index = 1U;
+ }
+ for (; level_index < c->num_input_levels(); level_index++) {
+ if (c->num_input_files(level_index) != 0) {
+ SequenceNumber smallest_seqno = 0U;
+ SequenceNumber largest_seqno = 0U;
+ GetSmallestLargestSeqno(*(c->inputs(level_index)), &smallest_seqno,
+ &largest_seqno);
+ if (is_first) {
+ is_first = false;
+ }
+ }
+ }
+#endif
+ // update statistics
+ size_t num_files = 0;
+ for (auto& each_level : *c->inputs()) {
+ num_files += each_level.files.size();
+ }
+ RecordInHistogram(ioptions_.stats, NUM_FILES_IN_SINGLE_COMPACTION, num_files);
+
+ picker_->RegisterCompaction(c);
+ vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_);
+
+ TEST_SYNC_POINT_CALLBACK("UniversalCompactionBuilder::PickCompaction:Return",
+ c);
+ return c;
+}
+
+uint32_t UniversalCompactionBuilder::GetPathId(
+ const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options, uint64_t file_size) {
+ // Two conditions need to be satisfied:
+ // (1) the target path needs to be able to hold the file's size
+  // (2) The total size left in this and previous paths needs to be no
+  //     smaller than the expected future file size before this new file is
+  //     compacted, which is estimated based on size_ratio.
+ // For example, if now we are compacting files of size (1, 1, 2, 4, 8),
+ // we will make sure the target file, probably with size of 16, will be
+ // placed in a path so that eventually when new files are generated and
+ // compacted to (1, 1, 2, 4, 8, 16), all those files can be stored in or
+ // before the path we chose.
+ //
+ // TODO(sdong): now the case of multiple column families is not
+ // considered in this algorithm. So the target size can be violated in
+ // that case. We need to improve it.
+ uint64_t accumulated_size = 0;
+ uint64_t future_size =
+ file_size *
+ (100 - mutable_cf_options.compaction_options_universal.size_ratio) / 100;
+ uint32_t p = 0;
+ assert(!ioptions.cf_paths.empty());
+ for (; p < ioptions.cf_paths.size() - 1; p++) {
+ uint64_t target_size = ioptions.cf_paths[p].target_size;
+ if (target_size > file_size &&
+ accumulated_size + (target_size - file_size) > future_size) {
+ return p;
+ }
+ accumulated_size += target_size;
+ }
+ return p;
+}
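+
+// [Editorial sketch, not part of the original source] The path-selection rule
+// above, restated as a self-contained function over hypothetical per-path
+// target sizes and kept inside #if 0 as illustration only: a path is chosen
+// when (1) its target size can hold the file and (2) the space left in this
+// and previous paths also covers the estimated future size, i.e. the file
+// size scaled down by size_ratio.
+#if 0
+uint32_t PickPathForFile(const std::vector<uint64_t>& path_target_sizes,
+                         uint64_t file_size, unsigned int size_ratio) {
+  const uint64_t future_size = file_size * (100 - size_ratio) / 100;
+  uint64_t accumulated_size = 0;
+  uint32_t p = 0;
+  for (; p + 1 < path_target_sizes.size(); p++) {
+    const uint64_t target_size = path_target_sizes[p];
+    if (target_size > file_size &&
+        accumulated_size + (target_size - file_size) > future_size) {
+      return p;  // this path can hold the file and the expected growth
+    }
+    accumulated_size += target_size;
+  }
+  return p;  // otherwise fall back to the last path
+}
+#endif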
+
+//
+// Consider compaction files based on their size differences with
+// the next file in time order.
+//
+Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns(
+ unsigned int ratio, unsigned int max_number_of_files_to_compact) {
+ unsigned int min_merge_width =
+ mutable_cf_options_.compaction_options_universal.min_merge_width;
+ unsigned int max_merge_width =
+ mutable_cf_options_.compaction_options_universal.max_merge_width;
+
+ const SortedRun* sr = nullptr;
+ bool done = false;
+ size_t start_index = 0;
+ unsigned int candidate_count = 0;
+
+ unsigned int max_files_to_compact =
+ std::min(max_merge_width, max_number_of_files_to_compact);
+ min_merge_width = std::max(min_merge_width, 2U);
+
+ // Caller checks the size before executing this function. This invariant is
+ // important because otherwise we may have a possible integer underflow when
+ // dealing with unsigned types.
+ assert(sorted_runs_.size() > 0);
+
+ // Considers a candidate file only if it is smaller than the
+ // total size accumulated so far.
+ for (size_t loop = 0; loop < sorted_runs_.size(); loop++) {
+ candidate_count = 0;
+
+ // Skip files that are already being compacted
+ for (sr = nullptr; loop < sorted_runs_.size(); loop++) {
+ sr = &sorted_runs_[loop];
+
+ if (!sr->being_compacted) {
+ candidate_count = 1;
+ break;
+ }
+ char file_num_buf[kFormatFileNumberBufSize];
+ sr->Dump(file_num_buf, sizeof(file_num_buf));
+ ROCKS_LOG_BUFFER(log_buffer_,
+ "[%s] Universal: %s"
+ "[%d] being compacted, skipping",
+ cf_name_.c_str(), file_num_buf, loop);
+
+ sr = nullptr;
+ }
+
+ // This file is not being compacted. Consider it as the
+ // first candidate to be compacted.
+ uint64_t candidate_size = sr != nullptr ? sr->compensated_file_size : 0;
+ if (sr != nullptr) {
+ char file_num_buf[kFormatFileNumberBufSize];
+ sr->Dump(file_num_buf, sizeof(file_num_buf), true);
+ ROCKS_LOG_BUFFER(log_buffer_,
+ "[%s] Universal: Possible candidate %s[%d].",
+ cf_name_.c_str(), file_num_buf, loop);
+ }
+
+ // Check if the succeeding files need compaction.
+ for (size_t i = loop + 1;
+ candidate_count < max_files_to_compact && i < sorted_runs_.size();
+ i++) {
+ const SortedRun* succeeding_sr = &sorted_runs_[i];
+ if (succeeding_sr->being_compacted) {
+ break;
+ }
+ // Pick files if the total/last candidate file size (increased by the
+ // specified ratio) is still larger than the next candidate file.
+ // candidate_size is the total size of files picked so far with the
+ // default kCompactionStopStyleTotalSize; with
+ // kCompactionStopStyleSimilarSize, it's simply the size of the last
+ // picked file.
+ double sz = candidate_size * (100.0 + ratio) / 100.0;
+ if (sz < static_cast<double>(succeeding_sr->size)) {
+ break;
+ }
+ if (mutable_cf_options_.compaction_options_universal.stop_style ==
+ kCompactionStopStyleSimilarSize) {
+ // Similar-size stopping rule: also check the last picked file isn't
+ // far larger than the next candidate file.
+ sz = (succeeding_sr->size * (100.0 + ratio)) / 100.0;
+ if (sz < static_cast<double>(candidate_size)) {
+ // If the small file we've encountered begins a run of similar-size
+ // files, we'll pick them up on a future iteration of the outer
+ // loop. If it's some lonely straggler, it'll eventually get picked
+ // by the last-resort read amp strategy which disregards size ratios.
+ break;
+ }
+ candidate_size = succeeding_sr->compensated_file_size;
+ } else { // default kCompactionStopStyleTotalSize
+ candidate_size += succeeding_sr->compensated_file_size;
+ }
+ candidate_count++;
+ }
+
+ // Found a series of consecutive files that need compaction.
+ if (candidate_count >= (unsigned int)min_merge_width) {
+ start_index = loop;
+ done = true;
+ break;
+ } else {
+ for (size_t i = loop;
+ i < loop + candidate_count && i < sorted_runs_.size(); i++) {
+ const SortedRun* skipping_sr = &sorted_runs_[i];
+ char file_num_buf[256];
+ skipping_sr->DumpSizeInfo(file_num_buf, sizeof(file_num_buf), loop);
+ ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: Skipping %s",
+ cf_name_.c_str(), file_num_buf);
+ }
+ }
+ }
+ if (!done || candidate_count <= 1) {
+ return nullptr;
+ }
+ size_t first_index_after = start_index + candidate_count;
+  // Decide whether to compress the output: compression is disabled if the
+  // older sorted runs left out of this compaction already account for at
+  // least compression_size_percent of the total data.
+ bool enable_compression = true;
+ int ratio_to_compress =
+ mutable_cf_options_.compaction_options_universal.compression_size_percent;
+ if (ratio_to_compress >= 0) {
+ uint64_t total_size = 0;
+ for (auto& sorted_run : sorted_runs_) {
+ total_size += sorted_run.compensated_file_size;
+ }
+
+ uint64_t older_file_size = 0;
+ for (size_t i = sorted_runs_.size() - 1; i >= first_index_after; i--) {
+ older_file_size += sorted_runs_[i].size;
+ if (older_file_size * 100L >= total_size * (long)ratio_to_compress) {
+ enable_compression = false;
+ break;
+ }
+ }
+ }
+
+ uint64_t estimated_total_size = 0;
+ for (unsigned int i = 0; i < first_index_after; i++) {
+ estimated_total_size += sorted_runs_[i].size;
+ }
+ uint32_t path_id =
+ GetPathId(ioptions_, mutable_cf_options_, estimated_total_size);
+ int start_level = sorted_runs_[start_index].level;
+ int output_level;
+ if (first_index_after == sorted_runs_.size()) {
+ output_level = vstorage_->num_levels() - 1;
+ } else if (sorted_runs_[first_index_after].level == 0) {
+ output_level = 0;
+ } else {
+ output_level = sorted_runs_[first_index_after].level - 1;
+ }
+
+ // last level is reserved for the files ingested behind
+ if (ioptions_.allow_ingest_behind &&
+ (output_level == vstorage_->num_levels() - 1)) {
+ assert(output_level > 1);
+ output_level--;
+ }
+
+ std::vector<CompactionInputFiles> inputs(vstorage_->num_levels());
+ for (size_t i = 0; i < inputs.size(); ++i) {
+ inputs[i].level = start_level + static_cast<int>(i);
+ }
+ for (size_t i = start_index; i < first_index_after; i++) {
+ auto& picking_sr = sorted_runs_[i];
+ if (picking_sr.level == 0) {
+ FileMetaData* picking_file = picking_sr.file;
+ inputs[0].files.push_back(picking_file);
+ } else {
+ auto& files = inputs[picking_sr.level - start_level].files;
+ for (auto* f : vstorage_->LevelFiles(picking_sr.level)) {
+ files.push_back(f);
+ }
+ }
+ char file_num_buf[256];
+ picking_sr.DumpSizeInfo(file_num_buf, sizeof(file_num_buf), i);
+ ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: Picking %s",
+ cf_name_.c_str(), file_num_buf);
+ }
+
+ std::vector<FileMetaData*> grandparents;
+  // Include grandparents for potential file cutting in incremental
+  // mode. This is for aligning file cutting boundaries across levels,
+  // so that subsequent compactions can pick files with aligned
+  // boundaries.
+  // Single files are only picked up in incremental mode, so there
+  // is no need to cover the full range otherwise.
+ if (mutable_cf_options_.compaction_options_universal.incremental &&
+ first_index_after < sorted_runs_.size() &&
+ sorted_runs_[first_index_after].level > 1) {
+ grandparents = vstorage_->LevelFiles(sorted_runs_[first_index_after].level);
+ }
+
+ if (output_level != 0 &&
+ picker_->FilesRangeOverlapWithCompaction(
+ inputs, output_level,
+ Compaction::EvaluatePenultimateLevel(vstorage_, ioptions_,
+ start_level, output_level))) {
+ return nullptr;
+ }
+ CompactionReason compaction_reason;
+ if (max_number_of_files_to_compact == UINT_MAX) {
+ compaction_reason = CompactionReason::kUniversalSizeRatio;
+ } else {
+ compaction_reason = CompactionReason::kUniversalSortedRunNum;
+ }
+ return new Compaction(vstorage_, ioptions_, mutable_cf_options_,
+ mutable_db_options_, std::move(inputs), output_level,
+ MaxFileSizeForLevel(mutable_cf_options_, output_level,
+ kCompactionStyleUniversal),
+ GetMaxOverlappingBytes(), path_id,
+ GetCompressionType(vstorage_, mutable_cf_options_,
+ output_level, 1, enable_compression),
+ GetCompressionOptions(mutable_cf_options_, vstorage_,
+ output_level, enable_compression),
+ Temperature::kUnknown,
+ /* max_subcompactions */ 0, grandparents,
+ /* is manual */ false, /* trim_ts */ "", score_,
+ false /* deletion_compaction */,
+ /* l0_files_might_overlap */ true, compaction_reason);
+}
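+
+// [Editorial sketch, not part of the original source] The size-ratio stopping
+// rule used above, reduced to plain numbers and kept inside #if 0 as
+// illustration only. With the default kCompactionStopStyleTotalSize, a run is
+// added to the candidate set as long as the accumulated candidate size,
+// inflated by `ratio` percent, is still at least as large as the next run.
+#if 0
+size_t CountRunsPickedByTotalSize(const std::vector<uint64_t>& run_sizes,
+                                  unsigned int ratio) {
+  if (run_sizes.empty()) {
+    return 0;
+  }
+  uint64_t candidate_size = run_sizes[0];
+  size_t picked = 1;
+  for (size_t i = 1; i < run_sizes.size(); i++) {
+    const double sz = candidate_size * (100.0 + ratio) / 100.0;
+    if (sz < static_cast<double>(run_sizes[i])) {
+      break;  // the next run is too large relative to what has been picked
+    }
+    candidate_size += run_sizes[i];
+    picked++;
+  }
+  return picked;
+}
+// Example: run sizes {1, 1, 2, 4, 8} with ratio == 1 pick all five runs,
+// since each next run is no larger than ~1.01x the accumulated total.
+#endif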
+
+// Look at overall size amplification. If size amplification
+// exceeds the configured value, then do a compaction
+// of the candidate files all the way up to the earliest
+// base file (overrides configured values of file-size ratios,
+// min_merge_width and max_merge_width).
+//
+Compaction* UniversalCompactionBuilder::PickCompactionToReduceSizeAmp() {
+ // percentage flexibility while reducing size amplification
+ uint64_t ratio = mutable_cf_options_.compaction_options_universal
+ .max_size_amplification_percent;
+
+ unsigned int candidate_count = 0;
+ uint64_t candidate_size = 0;
+ size_t start_index = 0;
+ const SortedRun* sr = nullptr;
+
+ assert(!sorted_runs_.empty());
+ if (sorted_runs_.back().being_compacted) {
+ return nullptr;
+ }
+
+ // Skip files that are already being compacted
+ for (size_t loop = 0; loop + 1 < sorted_runs_.size(); loop++) {
+ sr = &sorted_runs_[loop];
+ if (!sr->being_compacted) {
+ start_index = loop; // Consider this as the first candidate.
+ break;
+ }
+ char file_num_buf[kFormatFileNumberBufSize];
+ sr->Dump(file_num_buf, sizeof(file_num_buf), true);
+ ROCKS_LOG_BUFFER(log_buffer_,
+ "[%s] Universal: skipping %s[%d] compacted %s",
+ cf_name_.c_str(), file_num_buf, loop,
+ " cannot be a candidate to reduce size amp.\n");
+ sr = nullptr;
+ }
+
+ if (sr == nullptr) {
+ return nullptr; // no candidate files
+ }
+ {
+ char file_num_buf[kFormatFileNumberBufSize];
+ sr->Dump(file_num_buf, sizeof(file_num_buf), true);
+ ROCKS_LOG_BUFFER(
+ log_buffer_,
+ "[%s] Universal: First candidate %s[%" ROCKSDB_PRIszt "] %s",
+ cf_name_.c_str(), file_num_buf, start_index, " to reduce size amp.\n");
+ }
+
+ // size of the base sorted run for size amp calculation
+ uint64_t base_sr_size = sorted_runs_.back().size;
+ size_t sr_end_idx = sorted_runs_.size() - 1;
+ // If tiered compaction is enabled and the last sorted run is the last level
+ if (ioptions_.preclude_last_level_data_seconds > 0 &&
+ ioptions_.num_levels > 2 &&
+ sorted_runs_.back().level == ioptions_.num_levels - 1 &&
+ sorted_runs_.size() > 1) {
+ sr_end_idx = sorted_runs_.size() - 2;
+ base_sr_size = sorted_runs_[sr_end_idx].size;
+ }
+
+ // keep adding up all the remaining files
+ for (size_t loop = start_index; loop < sr_end_idx; loop++) {
+ sr = &sorted_runs_[loop];
+ if (sr->being_compacted) {
+      // TODO: with incremental compaction now supported, we might want to
+      // schedule some incremental compactions in parallel if needed.
+ char file_num_buf[kFormatFileNumberBufSize];
+ sr->Dump(file_num_buf, sizeof(file_num_buf), true);
+ ROCKS_LOG_BUFFER(
+ log_buffer_, "[%s] Universal: Possible candidate %s[%d] %s",
+ cf_name_.c_str(), file_num_buf, start_index,
+ " is already being compacted. No size amp reduction possible.\n");
+ return nullptr;
+ }
+ candidate_size += sr->compensated_file_size;
+ candidate_count++;
+ }
+ if (candidate_count == 0) {
+ return nullptr;
+ }
+
+ // size amplification = percentage of additional size
+ if (candidate_size * 100 < ratio * base_sr_size) {
+ ROCKS_LOG_BUFFER(
+ log_buffer_,
+ "[%s] Universal: size amp not needed. newer-files-total-size %" PRIu64
+ " earliest-file-size %" PRIu64,
+ cf_name_.c_str(), candidate_size, base_sr_size);
+ return nullptr;
+ } else {
+ ROCKS_LOG_BUFFER(
+ log_buffer_,
+ "[%s] Universal: size amp needed. newer-files-total-size %" PRIu64
+ " earliest-file-size %" PRIu64,
+ cf_name_.c_str(), candidate_size, base_sr_size);
+ }
+  // Since incremental compaction can't include more than the second last
+  // level, it can introduce a penalty compared to full compaction. We
+  // hard-code the penalty to be 80%: if the incremental pick would end up
+  // with a compaction fanout more than 80% above that of a full level
+  // compaction, we fall back to full level compaction.
+  // The 80% threshold is arbitrary and can be adjusted or made
+  // configurable in the future.
+  // This also prevents the case where compaction falls behind and we
+  // need to compact more levels for compactions to catch up.
+ if (mutable_cf_options_.compaction_options_universal.incremental) {
+ double fanout_threshold = static_cast<double>(base_sr_size) /
+ static_cast<double>(candidate_size) * 1.8;
+ Compaction* picked = PickIncrementalForReduceSizeAmp(fanout_threshold);
+ if (picked != nullptr) {
+      // Picking an incremental compaction might still fail, in which case
+      // we fall back to compacting the full level below.
+ return picked;
+ }
+ }
+ return PickCompactionWithSortedRunRange(
+ start_index, sr_end_idx, CompactionReason::kUniversalSizeAmplification);
+}
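+
+// [Editorial sketch, not part of the original source] The size-amplification
+// trigger above as a standalone predicate, kept inside #if 0 as illustration
+// only. For example, with max_size_amplification_percent set to 200,
+// compaction fires once the newer sorted runs together reach twice the size
+// of the base (oldest) sorted run.
+#if 0
+bool SizeAmpCompactionNeeded(uint64_t newer_runs_total_size,
+                             uint64_t base_run_size,
+                             uint64_t max_size_amplification_percent) {
+  // Mirrors the check above: candidate_size * 100 >= ratio * base_sr_size.
+  return newer_runs_total_size * 100 >=
+         max_size_amplification_percent * base_run_size;
+}
+#endif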
+
+Compaction* UniversalCompactionBuilder::PickIncrementalForReduceSizeAmp(
+ double fanout_threshold) {
+  // Try to find all potential compactions with total size just over
+  // options.max_compaction_bytes / 2, and take the one with the lowest
+  // fanout (defined in the declaration of the function).
+  // This is done by keeping a sliding window over the files at the second
+  // last level and expanding it while collecting the overlapping files in
+  // the last level. Once the total size exceeds the size threshold, the
+  // fanout value is calculated and the window is then shrunk from its
+  // start. This continues until the end of the level is reached.
+  // Finally, we try to include upper level files if they fall into
+  // the range.
+ //
+  // Note that this is a similar problem to leveled compaction's
+  // kMinOverlappingRatio priority, but instead of picking single files
+  // we expand to a target compaction size. The reason is that in
+  // leveled compaction the actual fanout tends to be high, e.g. 10, so
+  // even with a single file in the down-merging level, the extra size
+  // compacted in boundary files is at a lower ratio. But here users
+  // often size the second last level at 1/4, 1/3 or even 1/2 of the
+  // bottommost level, so picking a single file in the second last
+  // level would cause significant waste, which is not desirable.
+ //
+ // This algorithm has lots of room to improve to pick more efficient
+ // compactions.
+ assert(sorted_runs_.size() >= 2);
+ int second_last_level = sorted_runs_[sorted_runs_.size() - 2].level;
+ if (second_last_level == 0) {
+ // Can't split Level 0.
+ return nullptr;
+ }
+ int output_level = sorted_runs_.back().level;
+ const std::vector<FileMetaData*>& bottom_files =
+ vstorage_->LevelFiles(output_level);
+ const std::vector<FileMetaData*>& files =
+ vstorage_->LevelFiles(second_last_level);
+ assert(!bottom_files.empty());
+ assert(!files.empty());
+
+ // std::unordered_map<uint64_t, uint64_t> file_to_order;
+
+ int picked_start_idx = 0;
+ int picked_end_idx = 0;
+ double picked_fanout = fanout_threshold;
+
+  // Use half of the target compaction bytes as the anchor at which to stop
+  // growing the window of second-last-level files, reserving room for more
+  // overlapping bottom-level files, clean-cut expansion, files from other
+  // levels, etc.
+ uint64_t comp_thres_size = mutable_cf_options_.max_compaction_bytes / 2;
+ int start_idx = 0;
+ int bottom_end_idx = 0;
+ int bottom_start_idx = 0;
+ uint64_t non_bottom_size = 0;
+ uint64_t bottom_size = 0;
+ bool end_bottom_size_counted = false;
+ for (int end_idx = 0; end_idx < static_cast<int>(files.size()); end_idx++) {
+ FileMetaData* end_file = files[end_idx];
+
+ // Include bottom most level files smaller than the current second
+ // last level file.
+ int num_skipped = 0;
+ while (bottom_end_idx < static_cast<int>(bottom_files.size()) &&
+ icmp_->Compare(bottom_files[bottom_end_idx]->largest,
+ end_file->smallest) < 0) {
+ if (!end_bottom_size_counted) {
+ bottom_size += bottom_files[bottom_end_idx]->fd.file_size;
+ }
+ bottom_end_idx++;
+ end_bottom_size_counted = false;
+ num_skipped++;
+ }
+
+ if (num_skipped > 1) {
+      // At least one file in the bottommost level falls into the file gap,
+      // so there is no reason to include that file. We cut the range and
+      // start a new sliding window.
+ start_idx = end_idx;
+ }
+
+ if (start_idx == end_idx) {
+ // new sliding window.
+ non_bottom_size = 0;
+ bottom_size = 0;
+ bottom_start_idx = bottom_end_idx;
+ end_bottom_size_counted = false;
+ }
+
+ non_bottom_size += end_file->fd.file_size;
+
+ // Include all overlapping files in bottom level.
+ while (bottom_end_idx < static_cast<int>(bottom_files.size()) &&
+ icmp_->Compare(bottom_files[bottom_end_idx]->smallest,
+ end_file->largest) < 0) {
+ if (!end_bottom_size_counted) {
+ bottom_size += bottom_files[bottom_end_idx]->fd.file_size;
+ end_bottom_size_counted = true;
+ }
+ if (icmp_->Compare(bottom_files[bottom_end_idx]->largest,
+ end_file->largest) > 0) {
+        // The next-level file crosses the largest key boundary of the
+        // current file.
+ break;
+ }
+ bottom_end_idx++;
+ end_bottom_size_counted = false;
+ }
+
+ if ((non_bottom_size + bottom_size > comp_thres_size ||
+ end_idx == static_cast<int>(files.size()) - 1) &&
+        non_bottom_size > 0) {  // Do we allow 0-size files at all?
+ // If it is a better compaction, remember it in picked* variables.
+ double fanout = static_cast<double>(bottom_size) /
+ static_cast<double>(non_bottom_size);
+ if (fanout < picked_fanout) {
+ picked_start_idx = start_idx;
+ picked_end_idx = end_idx;
+ picked_fanout = fanout;
+ }
+      // Shrink the window from its start until the total size is under
+      // comp_thres_size.
+ while (non_bottom_size + bottom_size > comp_thres_size &&
+ start_idx <= end_idx) {
+ non_bottom_size -= files[start_idx]->fd.file_size;
+ start_idx++;
+ if (start_idx < static_cast<int>(files.size())) {
+ while (bottom_start_idx <= bottom_end_idx &&
+ icmp_->Compare(bottom_files[bottom_start_idx]->largest,
+ files[start_idx]->smallest) < 0) {
+ bottom_size -= bottom_files[bottom_start_idx]->fd.file_size;
+ bottom_start_idx++;
+ }
+ }
+ }
+ }
+ }
+
+ if (picked_fanout >= fanout_threshold) {
+ assert(picked_fanout == fanout_threshold);
+ return nullptr;
+ }
+
+ std::vector<CompactionInputFiles> inputs;
+ CompactionInputFiles bottom_level_inputs;
+ CompactionInputFiles second_last_level_inputs;
+ second_last_level_inputs.level = second_last_level;
+ bottom_level_inputs.level = output_level;
+ for (int i = picked_start_idx; i <= picked_end_idx; i++) {
+ if (files[i]->being_compacted) {
+ return nullptr;
+ }
+ second_last_level_inputs.files.push_back(files[i]);
+ }
+ assert(!second_last_level_inputs.empty());
+ if (!picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+ &second_last_level_inputs,
+ /*next_smallest=*/nullptr)) {
+ return nullptr;
+ }
+ // We might be able to avoid this binary search if we save and expand
+ // from bottom_start_idx and bottom_end_idx, but for now, we use
+ // SetupOtherInputs() for simplicity.
+ int parent_index = -1; // Create and use bottom_start_idx?
+ if (!picker_->SetupOtherInputs(cf_name_, mutable_cf_options_, vstorage_,
+ &second_last_level_inputs,
+ &bottom_level_inputs, &parent_index,
+ /*base_index=*/-1)) {
+ return nullptr;
+ }
+
+  // Try to include files in upper levels if they fall into the range.
+  // Since we need to go from the lower levels up, which is the reverse of
+  // the level order, we first write to a reversed data structure and
+  // finally copy it into the compaction inputs.
+ InternalKey smallest, largest;
+ picker_->GetRange(second_last_level_inputs, &smallest, &largest);
+ std::vector<CompactionInputFiles> inputs_reverse;
+ for (auto it = ++(++sorted_runs_.rbegin()); it != sorted_runs_.rend(); it++) {
+ SortedRun& sr = *it;
+ if (sr.level == 0) {
+ break;
+ }
+ std::vector<FileMetaData*> level_inputs;
+ vstorage_->GetCleanInputsWithinInterval(sr.level, &smallest, &largest,
+ &level_inputs);
+ if (!level_inputs.empty()) {
+ inputs_reverse.push_back({});
+ inputs_reverse.back().level = sr.level;
+ inputs_reverse.back().files = level_inputs;
+ picker_->GetRange(inputs_reverse.back(), &smallest, &largest);
+ }
+ }
+ for (auto it = inputs_reverse.rbegin(); it != inputs_reverse.rend(); it++) {
+ inputs.push_back(*it);
+ }
+
+ inputs.push_back(second_last_level_inputs);
+ inputs.push_back(bottom_level_inputs);
+
+ int start_level = Compaction::kInvalidLevel;
+ for (const auto& in : inputs) {
+ if (!in.empty()) {
+ // inputs should already be sorted by level
+ start_level = in.level;
+ break;
+ }
+ }
+
+ // intra L0 compactions outputs could have overlap
+ if (output_level != 0 &&
+ picker_->FilesRangeOverlapWithCompaction(
+ inputs, output_level,
+ Compaction::EvaluatePenultimateLevel(vstorage_, ioptions_,
+ start_level, output_level))) {
+ return nullptr;
+ }
+
+ // TODO support multi paths?
+ uint32_t path_id = 0;
+ return new Compaction(
+ vstorage_, ioptions_, mutable_cf_options_, mutable_db_options_,
+ std::move(inputs), output_level,
+ MaxFileSizeForLevel(mutable_cf_options_, output_level,
+ kCompactionStyleUniversal),
+ GetMaxOverlappingBytes(), path_id,
+ GetCompressionType(vstorage_, mutable_cf_options_, output_level, 1,
+ true /* enable_compression */),
+ GetCompressionOptions(mutable_cf_options_, vstorage_, output_level,
+ true /* enable_compression */),
+ Temperature::kUnknown,
+ /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ false,
+ /* trim_ts */ "", score_, false /* deletion_compaction */,
+ /* l0_files_might_overlap */ true,
+ CompactionReason::kUniversalSizeAmplification);
+}
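+
+// [Editorial sketch, not part of the original source] The fanout acceptance
+// check used by the incremental size-amp path above, kept inside #if 0 as
+// illustration only. With made-up sizes: a full compaction of a 100 GB base
+// level triggered by 50 GB of newer data has a fanout of 100 / 50 = 2, so an
+// incremental pick is accepted only if its own ratio of bottom-level bytes to
+// second-last-level bytes stays below 2 * 1.8 = 3.6, i.e. within the
+// hard-coded 80% penalty allowance.
+#if 0
+bool AcceptIncrementalPick(uint64_t base_sr_size, uint64_t candidate_size,
+                           uint64_t picked_bottom_bytes,
+                           uint64_t picked_second_level_bytes) {
+  const double fanout_threshold = static_cast<double>(base_sr_size) /
+                                  static_cast<double>(candidate_size) * 1.8;
+  const double picked_fanout = static_cast<double>(picked_bottom_bytes) /
+                               static_cast<double>(picked_second_level_bytes);
+  return picked_fanout < fanout_threshold;
+}
+#endif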
+
+// Pick files marked for compaction. Typically, files are marked by
+// CompactOnDeleteCollector due to the presence of tombstones.
+Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() {
+ CompactionInputFiles start_level_inputs;
+ int output_level;
+ std::vector<CompactionInputFiles> inputs;
+ std::vector<FileMetaData*> grandparents;
+
+ if (vstorage_->num_levels() == 1) {
+ // This is single level universal. Since we're basically trying to reclaim
+ // space by processing files marked for compaction due to high tombstone
+ // density, let's do the same thing as compaction to reduce size amp which
+ // has the same goals.
+ int start_index = -1;
+
+ start_level_inputs.level = 0;
+ start_level_inputs.files.clear();
+ output_level = 0;
+ // Find the first file marked for compaction. Ignore the last file
+ for (size_t loop = 0; loop + 1 < sorted_runs_.size(); loop++) {
+ SortedRun* sr = &sorted_runs_[loop];
+ if (sr->being_compacted) {
+ continue;
+ }
+ FileMetaData* f = vstorage_->LevelFiles(0)[loop];
+ if (f->marked_for_compaction) {
+ start_level_inputs.files.push_back(f);
+ start_index =
+ static_cast<int>(loop); // Consider this as the first candidate.
+ break;
+ }
+ }
+ if (start_index < 0) {
+ // Either no file marked, or they're already being compacted
+ return nullptr;
+ }
+
+ for (size_t loop = start_index + 1; loop < sorted_runs_.size(); loop++) {
+ SortedRun* sr = &sorted_runs_[loop];
+ if (sr->being_compacted) {
+ break;
+ }
+
+ FileMetaData* f = vstorage_->LevelFiles(0)[loop];
+ start_level_inputs.files.push_back(f);
+ }
+ if (start_level_inputs.size() <= 1) {
+ // If only the last file in L0 is marked for compaction, ignore it
+ return nullptr;
+ }
+ inputs.push_back(start_level_inputs);
+ } else {
+ int start_level;
+
+ // For multi-level universal, the strategy is to make this look more like
+ // leveled. We pick one of the files marked for compaction and compact with
+ // overlapping files in the adjacent level.
+ picker_->PickFilesMarkedForCompaction(cf_name_, vstorage_, &start_level,
+ &output_level, &start_level_inputs);
+ if (start_level_inputs.empty()) {
+ return nullptr;
+ }
+
+ // Pick the first non-empty level after the start_level
+ for (output_level = start_level + 1; output_level < vstorage_->num_levels();
+ output_level++) {
+ if (vstorage_->NumLevelFiles(output_level) != 0) {
+ break;
+ }
+ }
+
+ // If all higher levels are empty, pick the highest level as output level
+ if (output_level == vstorage_->num_levels()) {
+ if (start_level == 0) {
+ output_level = vstorage_->num_levels() - 1;
+ } else {
+ // If start level is non-zero and all higher levels are empty, this
+ // compaction will translate into a trivial move. Since the idea is
+ // to reclaim space and trivial move doesn't help with that, we
+ // skip compaction in this case and return nullptr
+ return nullptr;
+ }
+ }
+ if (ioptions_.allow_ingest_behind &&
+ output_level == vstorage_->num_levels() - 1) {
+ assert(output_level > 1);
+ output_level--;
+ }
+
+ if (output_level != 0) {
+ if (start_level == 0) {
+ if (!picker_->GetOverlappingL0Files(vstorage_, &start_level_inputs,
+ output_level, nullptr)) {
+ return nullptr;
+ }
+ }
+
+ CompactionInputFiles output_level_inputs;
+ int parent_index = -1;
+
+ output_level_inputs.level = output_level;
+ if (!picker_->SetupOtherInputs(cf_name_, mutable_cf_options_, vstorage_,
+ &start_level_inputs, &output_level_inputs,
+ &parent_index, -1)) {
+ return nullptr;
+ }
+ inputs.push_back(start_level_inputs);
+ if (!output_level_inputs.empty()) {
+ inputs.push_back(output_level_inputs);
+ }
+ if (picker_->FilesRangeOverlapWithCompaction(
+ inputs, output_level,
+ Compaction::EvaluatePenultimateLevel(
+ vstorage_, ioptions_, start_level, output_level))) {
+ return nullptr;
+ }
+
+ picker_->GetGrandparents(vstorage_, start_level_inputs,
+ output_level_inputs, &grandparents);
+ } else {
+ inputs.push_back(start_level_inputs);
+ }
+ }
+
+ uint64_t estimated_total_size = 0;
+ // Use size of the output level as estimated file size
+ for (FileMetaData* f : vstorage_->LevelFiles(output_level)) {
+ estimated_total_size += f->fd.GetFileSize();
+ }
+ uint32_t path_id =
+ GetPathId(ioptions_, mutable_cf_options_, estimated_total_size);
+ return new Compaction(
+ vstorage_, ioptions_, mutable_cf_options_, mutable_db_options_,
+ std::move(inputs), output_level,
+ MaxFileSizeForLevel(mutable_cf_options_, output_level,
+ kCompactionStyleUniversal),
+ /* max_grandparent_overlap_bytes */ GetMaxOverlappingBytes(), path_id,
+ GetCompressionType(vstorage_, mutable_cf_options_, output_level, 1),
+ GetCompressionOptions(mutable_cf_options_, vstorage_, output_level),
+ Temperature::kUnknown,
+ /* max_subcompactions */ 0, grandparents, /* is manual */ false,
+ /* trim_ts */ "", score_, false /* deletion_compaction */,
+ /* l0_files_might_overlap */ true,
+ CompactionReason::kFilesMarkedForCompaction);
+}
+
+Compaction* UniversalCompactionBuilder::PickCompactionToOldest(
+ size_t start_index, CompactionReason compaction_reason) {
+ return PickCompactionWithSortedRunRange(start_index, sorted_runs_.size() - 1,
+ compaction_reason);
+}
+
+Compaction* UniversalCompactionBuilder::PickCompactionWithSortedRunRange(
+ size_t start_index, size_t end_index, CompactionReason compaction_reason) {
+ assert(start_index < sorted_runs_.size());
+
+ // Estimate total file size
+ uint64_t estimated_total_size = 0;
+ for (size_t loop = start_index; loop <= end_index; loop++) {
+ estimated_total_size += sorted_runs_[loop].size;
+ }
+ uint32_t path_id =
+ GetPathId(ioptions_, mutable_cf_options_, estimated_total_size);
+ int start_level = sorted_runs_[start_index].level;
+
+ std::vector<CompactionInputFiles> inputs(vstorage_->num_levels());
+ for (size_t i = 0; i < inputs.size(); ++i) {
+ inputs[i].level = start_level + static_cast<int>(i);
+ }
+ for (size_t loop = start_index; loop <= end_index; loop++) {
+ auto& picking_sr = sorted_runs_[loop];
+ if (picking_sr.level == 0) {
+ FileMetaData* f = picking_sr.file;
+ inputs[0].files.push_back(f);
+ } else {
+ auto& files = inputs[picking_sr.level - start_level].files;
+ for (auto* f : vstorage_->LevelFiles(picking_sr.level)) {
+ files.push_back(f);
+ }
+ }
+ std::string comp_reason_print_string;
+ if (compaction_reason == CompactionReason::kPeriodicCompaction) {
+ comp_reason_print_string = "periodic compaction";
+ } else if (compaction_reason ==
+ CompactionReason::kUniversalSizeAmplification) {
+ comp_reason_print_string = "size amp";
+ } else {
+ assert(false);
+ comp_reason_print_string = "unknown: ";
+ comp_reason_print_string.append(
+ std::to_string(static_cast<int>(compaction_reason)));
+ }
+
+ char file_num_buf[256];
+ picking_sr.DumpSizeInfo(file_num_buf, sizeof(file_num_buf), loop);
+ ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: %s picking %s",
+ cf_name_.c_str(), comp_reason_print_string.c_str(),
+ file_num_buf);
+ }
+
+ int output_level;
+ if (end_index == sorted_runs_.size() - 1) {
+ // output files at the last level, unless it's reserved
+ output_level = vstorage_->num_levels() - 1;
+ // last level is reserved for the files ingested behind
+ if (ioptions_.allow_ingest_behind) {
+ assert(output_level > 1);
+ output_level--;
+ }
+ } else {
+    // If the compaction doesn't include all sorted runs, it can only output
+    // to the level above the `end_index + 1` sorted run.
+ output_level = sorted_runs_[end_index + 1].level - 1;
+ }
+
+ // intra L0 compactions outputs could have overlap
+ if (output_level != 0 &&
+ picker_->FilesRangeOverlapWithCompaction(
+ inputs, output_level,
+ Compaction::EvaluatePenultimateLevel(vstorage_, ioptions_,
+ start_level, output_level))) {
+ return nullptr;
+ }
+
+ // We never check size for
+ // compaction_options_universal.compression_size_percent,
+ // because we always compact all the files, so always compress.
+ return new Compaction(
+ vstorage_, ioptions_, mutable_cf_options_, mutable_db_options_,
+ std::move(inputs), output_level,
+ MaxFileSizeForLevel(mutable_cf_options_, output_level,
+ kCompactionStyleUniversal),
+ GetMaxOverlappingBytes(), path_id,
+ GetCompressionType(vstorage_, mutable_cf_options_, output_level, 1,
+ true /* enable_compression */),
+ GetCompressionOptions(mutable_cf_options_, vstorage_, output_level,
+ true /* enable_compression */),
+ Temperature::kUnknown,
+ /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ false,
+ /* trim_ts */ "", score_, false /* deletion_compaction */,
+ /* l0_files_might_overlap */ true, compaction_reason);
+}
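+
+// [Editorial sketch, not part of the original source] The output-level choice
+// used above, restated with hypothetical parameters and kept inside #if 0 as
+// illustration only: compacting through the last sorted run targets the
+// bottommost level (one level higher when ingest-behind reserves the last
+// level); otherwise the output goes to the level just above the first sorted
+// run that was left out.
+#if 0
+int ChooseOutputLevel(int num_levels, bool range_includes_last_run,
+                      int next_remaining_run_level, bool allow_ingest_behind) {
+  if (range_includes_last_run) {
+    int output_level = num_levels - 1;
+    if (allow_ingest_behind) {
+      output_level--;  // the last level is reserved for files ingested behind
+    }
+    return output_level;
+  }
+  return next_remaining_run_level - 1;
+}
+#endif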
+
+Compaction* UniversalCompactionBuilder::PickPeriodicCompaction() {
+ ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: Periodic Compaction",
+ cf_name_.c_str());
+
+  // In universal compaction, sorted runs containing older data are almost
+  // always generated earlier too. To simplify the problem, we just try to
+  // trigger a full compaction. We start from the oldest sorted run and
+  // include all sorted runs, until we hit a sorted run that is already being
+  // compacted. Since the largest (which is usually the oldest) sorted run is
+  // usually included anyway, doing a full compaction won't increase write
+  // amplification much.
+
+ // Get some information from marked files to check whether a file is
+ // included in the compaction.
+
+ size_t start_index = sorted_runs_.size();
+ while (start_index > 0 && !sorted_runs_[start_index - 1].being_compacted) {
+ start_index--;
+ }
+ if (start_index == sorted_runs_.size()) {
+ return nullptr;
+ }
+
+  // There is a rare corner case where we can't pick up all the files
+  // because some files are being compacted, and we end up picking files
+  // none of which actually needs periodic compaction. Unless we are simply
+  // recompacting the last sorted run (either the last level or the last L0
+  // file), we just execute the compaction anyway, in order to simplify the
+  // logic.
+ if (start_index == sorted_runs_.size() - 1) {
+ bool included_file_marked = false;
+ int start_level = sorted_runs_[start_index].level;
+ FileMetaData* start_file = sorted_runs_[start_index].file;
+ for (const std::pair<int, FileMetaData*>& level_file_pair :
+ vstorage_->FilesMarkedForPeriodicCompaction()) {
+ if (start_level != 0) {
+ // Last sorted run is a level
+ if (start_level == level_file_pair.first) {
+ included_file_marked = true;
+ break;
+ }
+ } else {
+ // Last sorted run is a L0 file.
+ if (start_file == level_file_pair.second) {
+ included_file_marked = true;
+ break;
+ }
+ }
+ }
+ if (!included_file_marked) {
+ ROCKS_LOG_BUFFER(log_buffer_,
+ "[%s] Universal: Cannot form a compaction covering file "
+ "marked for periodic compaction",
+ cf_name_.c_str());
+ return nullptr;
+ }
+ }
+
+ Compaction* c = PickCompactionToOldest(start_index,
+ CompactionReason::kPeriodicCompaction);
+
+ TEST_SYNC_POINT_CALLBACK(
+ "UniversalCompactionPicker::PickPeriodicCompaction:Return", c);
+
+ return c;
+}
+
+uint64_t UniversalCompactionBuilder::GetMaxOverlappingBytes() const {
+ if (!mutable_cf_options_.compaction_options_universal.incremental) {
+ return std::numeric_limits<uint64_t>::max();
+ } else {
+    // Try to align the cutting boundary with files at the next level, as
+    // long as the file doesn't end up smaller than 1/2 of the target size;
+    // otherwise it could overlap with two full-size files at the next
+    // level.
+ return mutable_cf_options_.target_file_size_base / 2 * 3;
+ }
+}
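+
+// [Editorial note, not part of the original source] Worked example of the cap
+// above: with a hypothetical target_file_size_base of 64 MB, incremental mode
+// limits the overlap with the next level to 64 / 2 * 3 = 96 MB, i.e. 1.5x the
+// target file size, while non-incremental mode leaves the overlap unbounded.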
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/compaction/compaction_picker_universal.h b/src/rocksdb/db/compaction/compaction_picker_universal.h
new file mode 100644
index 000000000..5f897cc9b
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_universal.h
@@ -0,0 +1,32 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include "db/compaction/compaction_picker.h"
+
+namespace ROCKSDB_NAMESPACE {
+class UniversalCompactionPicker : public CompactionPicker {
+ public:
+ UniversalCompactionPicker(const ImmutableOptions& ioptions,
+ const InternalKeyComparator* icmp)
+ : CompactionPicker(ioptions, icmp) {}
+ virtual Compaction* PickCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+ LogBuffer* log_buffer,
+ SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override;
+ virtual int MaxOutputLevel() const override { return NumberLevels() - 1; }
+
+ virtual bool NeedsCompaction(
+ const VersionStorageInfo* vstorage) const override;
+};
+} // namespace ROCKSDB_NAMESPACE
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/compaction/compaction_service_job.cc b/src/rocksdb/db/compaction/compaction_service_job.cc
new file mode 100644
index 000000000..1d2e99d99
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_service_job.cc
@@ -0,0 +1,829 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction_job.h"
+#include "db/compaction/compaction_state.h"
+#include "logging/logging.h"
+#include "monitoring/iostats_context_imp.h"
+#include "monitoring/thread_status_util.h"
+#include "options/options_helper.h"
+#include "rocksdb/utilities/options_type.h"
+
+#ifndef ROCKSDB_LITE
+namespace ROCKSDB_NAMESPACE {
+class SubcompactionState;
+
+CompactionServiceJobStatus
+CompactionJob::ProcessKeyValueCompactionWithCompactionService(
+ SubcompactionState* sub_compact) {
+ assert(sub_compact);
+ assert(sub_compact->compaction);
+ assert(db_options_.compaction_service);
+
+ const Compaction* compaction = sub_compact->compaction;
+ CompactionServiceInput compaction_input;
+ compaction_input.output_level = compaction->output_level();
+ compaction_input.db_id = db_id_;
+
+ const std::vector<CompactionInputFiles>& inputs =
+ *(compact_->compaction->inputs());
+ for (const auto& files_per_level : inputs) {
+ for (const auto& file : files_per_level.files) {
+ compaction_input.input_files.emplace_back(
+ MakeTableFileName(file->fd.GetNumber()));
+ }
+ }
+ compaction_input.column_family.name =
+ compaction->column_family_data()->GetName();
+ compaction_input.column_family.options =
+ compaction->column_family_data()->GetLatestCFOptions();
+ compaction_input.db_options =
+ BuildDBOptions(db_options_, mutable_db_options_copy_);
+ compaction_input.snapshots = existing_snapshots_;
+ compaction_input.has_begin = sub_compact->start.has_value();
+ compaction_input.begin =
+ compaction_input.has_begin ? sub_compact->start->ToString() : "";
+ compaction_input.has_end = sub_compact->end.has_value();
+ compaction_input.end =
+ compaction_input.has_end ? sub_compact->end->ToString() : "";
+
+ std::string compaction_input_binary;
+ Status s = compaction_input.Write(&compaction_input_binary);
+ if (!s.ok()) {
+ sub_compact->status = s;
+ return CompactionServiceJobStatus::kFailure;
+ }
+
+ std::ostringstream input_files_oss;
+ bool is_first_one = true;
+ for (const auto& file : compaction_input.input_files) {
+ input_files_oss << (is_first_one ? "" : ", ") << file;
+ is_first_one = false;
+ }
+
+ ROCKS_LOG_INFO(
+ db_options_.info_log,
+ "[%s] [JOB %d] Starting remote compaction (output level: %d): %s",
+ compaction_input.column_family.name.c_str(), job_id_,
+ compaction_input.output_level, input_files_oss.str().c_str());
+ CompactionServiceJobInfo info(dbname_, db_id_, db_session_id_,
+ GetCompactionId(sub_compact), thread_pri_);
+ CompactionServiceJobStatus compaction_status =
+ db_options_.compaction_service->StartV2(info, compaction_input_binary);
+ switch (compaction_status) {
+ case CompactionServiceJobStatus::kSuccess:
+ break;
+ case CompactionServiceJobStatus::kFailure:
+ sub_compact->status = Status::Incomplete(
+ "CompactionService failed to start compaction job.");
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "[%s] [JOB %d] Remote compaction failed to start.",
+ compaction_input.column_family.name.c_str(), job_id_);
+ return compaction_status;
+ case CompactionServiceJobStatus::kUseLocal:
+ ROCKS_LOG_INFO(
+ db_options_.info_log,
+ "[%s] [JOB %d] Remote compaction fallback to local by API Start.",
+ compaction_input.column_family.name.c_str(), job_id_);
+ return compaction_status;
+ default:
+ assert(false); // unknown status
+ break;
+ }
+
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "[%s] [JOB %d] Waiting for remote compaction...",
+ compaction_input.column_family.name.c_str(), job_id_);
+ std::string compaction_result_binary;
+ compaction_status = db_options_.compaction_service->WaitForCompleteV2(
+ info, &compaction_result_binary);
+
+ if (compaction_status == CompactionServiceJobStatus::kUseLocal) {
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "[%s] [JOB %d] Remote compaction fallback to local by API "
+ "WaitForComplete.",
+ compaction_input.column_family.name.c_str(), job_id_);
+ return compaction_status;
+ }
+
+ CompactionServiceResult compaction_result;
+ s = CompactionServiceResult::Read(compaction_result_binary,
+ &compaction_result);
+
+ if (compaction_status == CompactionServiceJobStatus::kFailure) {
+ if (s.ok()) {
+ if (compaction_result.status.ok()) {
+ sub_compact->status = Status::Incomplete(
+ "CompactionService failed to run the compaction job (even though "
+ "the internal status is okay).");
+ } else {
+ // set the current sub compaction status with the status returned from
+ // remote
+ sub_compact->status = compaction_result.status;
+ }
+ } else {
+ sub_compact->status = Status::Incomplete(
+ "CompactionService failed to run the compaction job (and no valid "
+ "result is returned).");
+ compaction_result.status.PermitUncheckedError();
+ }
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "[%s] [JOB %d] Remote compaction failed.",
+ compaction_input.column_family.name.c_str(), job_id_);
+ return compaction_status;
+ }
+
+ if (!s.ok()) {
+ sub_compact->status = s;
+ compaction_result.status.PermitUncheckedError();
+ return CompactionServiceJobStatus::kFailure;
+ }
+ sub_compact->status = compaction_result.status;
+
+ std::ostringstream output_files_oss;
+ is_first_one = true;
+ for (const auto& file : compaction_result.output_files) {
+ output_files_oss << (is_first_one ? "" : ", ") << file.file_name;
+ is_first_one = false;
+ }
+
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "[%s] [JOB %d] Receive remote compaction result, output path: "
+ "%s, files: %s",
+ compaction_input.column_family.name.c_str(), job_id_,
+ compaction_result.output_path.c_str(),
+ output_files_oss.str().c_str());
+
+ if (!s.ok()) {
+ sub_compact->status = s;
+ return CompactionServiceJobStatus::kFailure;
+ }
+
+ for (const auto& file : compaction_result.output_files) {
+ uint64_t file_num = versions_->NewFileNumber();
+ auto src_file = compaction_result.output_path + "/" + file.file_name;
+ auto tgt_file = TableFileName(compaction->immutable_options()->cf_paths,
+ file_num, compaction->output_path_id());
+ s = fs_->RenameFile(src_file, tgt_file, IOOptions(), nullptr);
+ if (!s.ok()) {
+ sub_compact->status = s;
+ return CompactionServiceJobStatus::kFailure;
+ }
+
+ FileMetaData meta;
+ uint64_t file_size;
+ s = fs_->GetFileSize(tgt_file, IOOptions(), &file_size, nullptr);
+ if (!s.ok()) {
+ sub_compact->status = s;
+ return CompactionServiceJobStatus::kFailure;
+ }
+ meta.fd = FileDescriptor(file_num, compaction->output_path_id(), file_size,
+ file.smallest_seqno, file.largest_seqno);
+ meta.smallest.DecodeFrom(file.smallest_internal_key);
+ meta.largest.DecodeFrom(file.largest_internal_key);
+ meta.oldest_ancester_time = file.oldest_ancester_time;
+ meta.file_creation_time = file.file_creation_time;
+ meta.marked_for_compaction = file.marked_for_compaction;
+ meta.unique_id = file.unique_id;
+
+ auto cfd = compaction->column_family_data();
+ sub_compact->Current().AddOutput(std::move(meta),
+ cfd->internal_comparator(), false, false,
+ true, file.paranoid_hash);
+ }
+ sub_compact->compaction_job_stats = compaction_result.stats;
+ sub_compact->Current().SetNumOutputRecords(
+ compaction_result.num_output_records);
+ sub_compact->Current().SetTotalBytes(compaction_result.total_bytes);
+ RecordTick(stats_, REMOTE_COMPACT_READ_BYTES, compaction_result.bytes_read);
+ RecordTick(stats_, REMOTE_COMPACT_WRITE_BYTES,
+ compaction_result.bytes_written);
+ return CompactionServiceJobStatus::kSuccess;
+}
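+
+// [Editorial sketch, not part of the original source] The sequence above --
+// serialize a CompactionServiceInput, hand it off via StartV2(), then block
+// in WaitForCompleteV2() for the serialized CompactionServiceResult -- is the
+// contract a CompactionService plugin fulfils. A minimal service that always
+// asks the primary DB to run the compaction locally might look roughly like
+// the following; the exact base-class interface should be verified against
+// the RocksDB headers for the version in use. Kept inside #if 0 as
+// illustration only.
+#if 0
+class AlwaysLocalCompactionService : public CompactionService {
+ public:
+  const char* Name() const override { return "AlwaysLocalCompactionService"; }
+
+  CompactionServiceJobStatus StartV2(
+      const CompactionServiceJobInfo& /*info*/,
+      const std::string& /*compaction_service_input*/) override {
+    // Decline the remote job; the caller falls back to local compaction.
+    return CompactionServiceJobStatus::kUseLocal;
+  }
+
+  CompactionServiceJobStatus WaitForCompleteV2(
+      const CompactionServiceJobInfo& /*info*/,
+      std::string* /*compaction_service_result*/) override {
+    return CompactionServiceJobStatus::kUseLocal;
+  }
+};
+#endif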
+
+std::string CompactionServiceCompactionJob::GetTableFileName(
+ uint64_t file_number) {
+ return MakeTableFileName(output_path_, file_number);
+}
+
+void CompactionServiceCompactionJob::RecordCompactionIOStats() {
+ compaction_result_->bytes_read += IOSTATS(bytes_read);
+ compaction_result_->bytes_written += IOSTATS(bytes_written);
+ CompactionJob::RecordCompactionIOStats();
+}
+
+CompactionServiceCompactionJob::CompactionServiceCompactionJob(
+ int job_id, Compaction* compaction, const ImmutableDBOptions& db_options,
+ const MutableDBOptions& mutable_db_options, const FileOptions& file_options,
+ VersionSet* versions, const std::atomic<bool>* shutting_down,
+ LogBuffer* log_buffer, FSDirectory* output_directory, Statistics* stats,
+ InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler,
+ std::vector<SequenceNumber> existing_snapshots,
+ std::shared_ptr<Cache> table_cache, EventLogger* event_logger,
+ const std::string& dbname, const std::shared_ptr<IOTracer>& io_tracer,
+ const std::atomic<bool>& manual_compaction_canceled,
+ const std::string& db_id, const std::string& db_session_id,
+ std::string output_path,
+ const CompactionServiceInput& compaction_service_input,
+ CompactionServiceResult* compaction_service_result)
+ : CompactionJob(
+ job_id, compaction, db_options, mutable_db_options, file_options,
+ versions, shutting_down, log_buffer, nullptr, output_directory,
+ nullptr, stats, db_mutex, db_error_handler,
+ std::move(existing_snapshots), kMaxSequenceNumber, nullptr, nullptr,
+ std::move(table_cache), event_logger,
+ compaction->mutable_cf_options()->paranoid_file_checks,
+ compaction->mutable_cf_options()->report_bg_io_stats, dbname,
+ &(compaction_service_result->stats), Env::Priority::USER, io_tracer,
+ manual_compaction_canceled, db_id, db_session_id,
+ compaction->column_family_data()->GetFullHistoryTsLow()),
+ output_path_(std::move(output_path)),
+ compaction_input_(compaction_service_input),
+ compaction_result_(compaction_service_result) {}
+
+Status CompactionServiceCompactionJob::Run() {
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_COMPACTION_RUN);
+
+ auto* c = compact_->compaction;
+ assert(c->column_family_data() != nullptr);
+ assert(c->column_family_data()->current()->storage_info()->NumLevelFiles(
+ compact_->compaction->level()) > 0);
+
+ write_hint_ =
+ c->column_family_data()->CalculateSSTWriteHint(c->output_level());
+ bottommost_level_ = c->bottommost_level();
+
+ Slice begin = compaction_input_.begin;
+ Slice end = compaction_input_.end;
+ compact_->sub_compact_states.emplace_back(
+ c,
+ compaction_input_.has_begin ? std::optional<Slice>(begin)
+ : std::optional<Slice>(),
+ compaction_input_.has_end ? std::optional<Slice>(end)
+ : std::optional<Slice>(),
+ /*sub_job_id*/ 0);
+
+ log_buffer_->FlushBufferToLog();
+ LogCompaction();
+ const uint64_t start_micros = db_options_.clock->NowMicros();
+ // Pick the only sub-compaction we should have
+ assert(compact_->sub_compact_states.size() == 1);
+ SubcompactionState* sub_compact = compact_->sub_compact_states.data();
+
+ ProcessKeyValueCompaction(sub_compact);
+
+ compaction_stats_.stats.micros =
+ db_options_.clock->NowMicros() - start_micros;
+ compaction_stats_.stats.cpu_micros =
+ sub_compact->compaction_job_stats.cpu_micros;
+
+ RecordTimeToHistogram(stats_, COMPACTION_TIME,
+ compaction_stats_.stats.micros);
+ RecordTimeToHistogram(stats_, COMPACTION_CPU_TIME,
+ compaction_stats_.stats.cpu_micros);
+
+ Status status = sub_compact->status;
+ IOStatus io_s = sub_compact->io_status;
+
+ if (io_status_.ok()) {
+ io_status_ = io_s;
+ }
+
+ if (status.ok()) {
+ constexpr IODebugContext* dbg = nullptr;
+
+ if (output_directory_) {
+ io_s = output_directory_->FsyncWithDirOptions(IOOptions(), dbg,
+ DirFsyncOptions());
+ }
+ }
+ if (io_status_.ok()) {
+ io_status_ = io_s;
+ }
+ if (status.ok()) {
+ status = io_s;
+ }
+ if (status.ok()) {
+ // TODO: Add verify_table()
+ }
+
+ // Finish up all book-keeping to unify the subcompaction results
+ compact_->AggregateCompactionStats(compaction_stats_, *compaction_job_stats_);
+ UpdateCompactionStats();
+ RecordCompactionIOStats();
+
+ LogFlush(db_options_.info_log);
+ compact_->status = status;
+ compact_->status.PermitUncheckedError();
+
+ // Build compaction result
+ compaction_result_->output_level = compact_->compaction->output_level();
+ compaction_result_->output_path = output_path_;
+ for (const auto& output_file : sub_compact->GetOutputs()) {
+ auto& meta = output_file.meta;
+ compaction_result_->output_files.emplace_back(
+ MakeTableFileName(meta.fd.GetNumber()), meta.fd.smallest_seqno,
+ meta.fd.largest_seqno, meta.smallest.Encode().ToString(),
+ meta.largest.Encode().ToString(), meta.oldest_ancester_time,
+ meta.file_creation_time, output_file.validator.GetHash(),
+ meta.marked_for_compaction, meta.unique_id);
+ }
+ InternalStats::CompactionStatsFull compaction_stats;
+ sub_compact->AggregateCompactionStats(compaction_stats);
+ compaction_result_->num_output_records =
+ compaction_stats.stats.num_output_records;
+ compaction_result_->total_bytes = compaction_stats.TotalBytesWritten();
+
+ return status;
+}
+
+void CompactionServiceCompactionJob::CleanupCompaction() {
+ CompactionJob::CleanupCompaction();
+}
+
+// Internal binary format for the input and result data
+enum BinaryFormatVersion : uint32_t {
+ kOptionsString = 1, // Use string format similar to Option string format
+};
+
+static std::unordered_map<std::string, OptionTypeInfo> cfd_type_info = {
+ {"name",
+ {offsetof(struct ColumnFamilyDescriptor, name), OptionType::kEncodedString,
+ OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+ {"options",
+ {offsetof(struct ColumnFamilyDescriptor, options),
+ OptionType::kConfigurable, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone,
+ [](const ConfigOptions& opts, const std::string& /*name*/,
+ const std::string& value, void* addr) {
+ auto cf_options = static_cast<ColumnFamilyOptions*>(addr);
+ return GetColumnFamilyOptionsFromString(opts, ColumnFamilyOptions(),
+ value, cf_options);
+ },
+ [](const ConfigOptions& opts, const std::string& /*name*/,
+ const void* addr, std::string* value) {
+ const auto cf_options = static_cast<const ColumnFamilyOptions*>(addr);
+ std::string result;
+ auto status =
+ GetStringFromColumnFamilyOptions(opts, *cf_options, &result);
+ *value = "{" + result + "}";
+ return status;
+ },
+ [](const ConfigOptions& opts, const std::string& name, const void* addr1,
+ const void* addr2, std::string* mismatch) {
+ const auto this_one = static_cast<const ColumnFamilyOptions*>(addr1);
+ const auto that_one = static_cast<const ColumnFamilyOptions*>(addr2);
+ auto this_conf = CFOptionsAsConfigurable(*this_one);
+ auto that_conf = CFOptionsAsConfigurable(*that_one);
+ std::string mismatch_opt;
+ bool result =
+ this_conf->AreEquivalent(opts, that_conf.get(), &mismatch_opt);
+ if (!result) {
+ *mismatch = name + "." + mismatch_opt;
+ }
+ return result;
+ }}},
+};
+
+static std::unordered_map<std::string, OptionTypeInfo> cs_input_type_info = {
+ {"column_family",
+ OptionTypeInfo::Struct(
+ "column_family", &cfd_type_info,
+ offsetof(struct CompactionServiceInput, column_family),
+ OptionVerificationType::kNormal, OptionTypeFlags::kNone)},
+ {"db_options",
+ {offsetof(struct CompactionServiceInput, db_options),
+ OptionType::kConfigurable, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone,
+ [](const ConfigOptions& opts, const std::string& /*name*/,
+ const std::string& value, void* addr) {
+ auto options = static_cast<DBOptions*>(addr);
+ return GetDBOptionsFromString(opts, DBOptions(), value, options);
+ },
+ [](const ConfigOptions& opts, const std::string& /*name*/,
+ const void* addr, std::string* value) {
+ const auto options = static_cast<const DBOptions*>(addr);
+ std::string result;
+ auto status = GetStringFromDBOptions(opts, *options, &result);
+ *value = "{" + result + "}";
+ return status;
+ },
+ [](const ConfigOptions& opts, const std::string& name, const void* addr1,
+ const void* addr2, std::string* mismatch) {
+ const auto this_one = static_cast<const DBOptions*>(addr1);
+ const auto that_one = static_cast<const DBOptions*>(addr2);
+ auto this_conf = DBOptionsAsConfigurable(*this_one);
+ auto that_conf = DBOptionsAsConfigurable(*that_one);
+ std::string mismatch_opt;
+ bool result =
+ this_conf->AreEquivalent(opts, that_conf.get(), &mismatch_opt);
+ if (!result) {
+ *mismatch = name + "." + mismatch_opt;
+ }
+ return result;
+ }}},
+ {"snapshots", OptionTypeInfo::Vector<uint64_t>(
+ offsetof(struct CompactionServiceInput, snapshots),
+ OptionVerificationType::kNormal, OptionTypeFlags::kNone,
+ {0, OptionType::kUInt64T})},
+ {"input_files", OptionTypeInfo::Vector<std::string>(
+ offsetof(struct CompactionServiceInput, input_files),
+ OptionVerificationType::kNormal, OptionTypeFlags::kNone,
+ {0, OptionType::kEncodedString})},
+ {"output_level",
+ {offsetof(struct CompactionServiceInput, output_level), OptionType::kInt,
+ OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+ {"db_id",
+ {offsetof(struct CompactionServiceInput, db_id),
+ OptionType::kEncodedString}},
+ {"has_begin",
+ {offsetof(struct CompactionServiceInput, has_begin), OptionType::kBoolean,
+ OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+ {"begin",
+ {offsetof(struct CompactionServiceInput, begin),
+ OptionType::kEncodedString, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"has_end",
+ {offsetof(struct CompactionServiceInput, has_end), OptionType::kBoolean,
+ OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+ {"end",
+ {offsetof(struct CompactionServiceInput, end), OptionType::kEncodedString,
+ OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+};
+
+static std::unordered_map<std::string, OptionTypeInfo>
+ cs_output_file_type_info = {
+ {"file_name",
+ {offsetof(struct CompactionServiceOutputFile, file_name),
+ OptionType::kEncodedString, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"smallest_seqno",
+ {offsetof(struct CompactionServiceOutputFile, smallest_seqno),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"largest_seqno",
+ {offsetof(struct CompactionServiceOutputFile, largest_seqno),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"smallest_internal_key",
+ {offsetof(struct CompactionServiceOutputFile, smallest_internal_key),
+ OptionType::kEncodedString, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"largest_internal_key",
+ {offsetof(struct CompactionServiceOutputFile, largest_internal_key),
+ OptionType::kEncodedString, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"oldest_ancester_time",
+ {offsetof(struct CompactionServiceOutputFile, oldest_ancester_time),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"file_creation_time",
+ {offsetof(struct CompactionServiceOutputFile, file_creation_time),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"paranoid_hash",
+ {offsetof(struct CompactionServiceOutputFile, paranoid_hash),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"marked_for_compaction",
+ {offsetof(struct CompactionServiceOutputFile, marked_for_compaction),
+ OptionType::kBoolean, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"unique_id",
+ OptionTypeInfo::Array<uint64_t, 2>(
+ offsetof(struct CompactionServiceOutputFile, unique_id),
+ OptionVerificationType::kNormal, OptionTypeFlags::kNone,
+ {0, OptionType::kUInt64T})},
+};
+
+static std::unordered_map<std::string, OptionTypeInfo>
+ compaction_job_stats_type_info = {
+ {"elapsed_micros",
+ {offsetof(struct CompactionJobStats, elapsed_micros),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"cpu_micros",
+ {offsetof(struct CompactionJobStats, cpu_micros), OptionType::kUInt64T,
+ OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+ {"num_input_records",
+ {offsetof(struct CompactionJobStats, num_input_records),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"num_blobs_read",
+ {offsetof(struct CompactionJobStats, num_blobs_read),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"num_input_files",
+ {offsetof(struct CompactionJobStats, num_input_files),
+ OptionType::kSizeT, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"num_input_files_at_output_level",
+ {offsetof(struct CompactionJobStats, num_input_files_at_output_level),
+ OptionType::kSizeT, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"num_output_records",
+ {offsetof(struct CompactionJobStats, num_output_records),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"num_output_files",
+ {offsetof(struct CompactionJobStats, num_output_files),
+ OptionType::kSizeT, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"num_output_files_blob",
+ {offsetof(struct CompactionJobStats, num_output_files_blob),
+ OptionType::kSizeT, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"is_full_compaction",
+ {offsetof(struct CompactionJobStats, is_full_compaction),
+ OptionType::kBoolean, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"is_manual_compaction",
+ {offsetof(struct CompactionJobStats, is_manual_compaction),
+ OptionType::kBoolean, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"total_input_bytes",
+ {offsetof(struct CompactionJobStats, total_input_bytes),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"total_blob_bytes_read",
+ {offsetof(struct CompactionJobStats, total_blob_bytes_read),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"total_output_bytes",
+ {offsetof(struct CompactionJobStats, total_output_bytes),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"total_output_bytes_blob",
+ {offsetof(struct CompactionJobStats, total_output_bytes_blob),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"num_records_replaced",
+ {offsetof(struct CompactionJobStats, num_records_replaced),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"total_input_raw_key_bytes",
+ {offsetof(struct CompactionJobStats, total_input_raw_key_bytes),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"total_input_raw_value_bytes",
+ {offsetof(struct CompactionJobStats, total_input_raw_value_bytes),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"num_input_deletion_records",
+ {offsetof(struct CompactionJobStats, num_input_deletion_records),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"num_expired_deletion_records",
+ {offsetof(struct CompactionJobStats, num_expired_deletion_records),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"num_corrupt_keys",
+ {offsetof(struct CompactionJobStats, num_corrupt_keys),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"file_write_nanos",
+ {offsetof(struct CompactionJobStats, file_write_nanos),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"file_range_sync_nanos",
+ {offsetof(struct CompactionJobStats, file_range_sync_nanos),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"file_fsync_nanos",
+ {offsetof(struct CompactionJobStats, file_fsync_nanos),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"file_prepare_write_nanos",
+ {offsetof(struct CompactionJobStats, file_prepare_write_nanos),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"smallest_output_key_prefix",
+ {offsetof(struct CompactionJobStats, smallest_output_key_prefix),
+ OptionType::kEncodedString, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"largest_output_key_prefix",
+ {offsetof(struct CompactionJobStats, largest_output_key_prefix),
+ OptionType::kEncodedString, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"num_single_del_fallthru",
+ {offsetof(struct CompactionJobStats, num_single_del_fallthru),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"num_single_del_mismatch",
+ {offsetof(struct CompactionJobStats, num_single_del_mismatch),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+};
+
+namespace {
+// This is a helper struct used to serialize and deserialize class Status,
+// because Status's members are not public.
+struct StatusSerializationAdapter {
+ uint8_t code;
+ uint8_t subcode;
+ uint8_t severity;
+ std::string message;
+
+ StatusSerializationAdapter() = default;
+ explicit StatusSerializationAdapter(const Status& s) {
+ code = s.code();
+ subcode = s.subcode();
+ severity = s.severity();
+ auto msg = s.getState();
+ message = msg ? msg : "";
+ }
+
+ Status GetStatus() const {
+ return Status{static_cast<Status::Code>(code),
+ static_cast<Status::SubCode>(subcode),
+ static_cast<Status::Severity>(severity), message};
+ }
+};
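+// Illustrative usage (exposition only, not part of the original source): the
+// adapter captures the code, subcode, severity and message of a Status so
+// they can be written out via status_adapter_type_info below and reassembled
+// with GetStatus(), e.g.
+//   StatusSerializationAdapter adapter(Status::Corruption("bad block"));
+//   Status restored = adapter.GetStatus();  // same code/subcode/message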
+} // namespace
+
+static std::unordered_map<std::string, OptionTypeInfo>
+ status_adapter_type_info = {
+ {"code",
+ {offsetof(struct StatusSerializationAdapter, code),
+ OptionType::kUInt8T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"subcode",
+ {offsetof(struct StatusSerializationAdapter, subcode),
+ OptionType::kUInt8T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"severity",
+ {offsetof(struct StatusSerializationAdapter, severity),
+ OptionType::kUInt8T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"message",
+ {offsetof(struct StatusSerializationAdapter, message),
+ OptionType::kEncodedString, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+};
+
+static std::unordered_map<std::string, OptionTypeInfo> cs_result_type_info = {
+ {"status",
+ {offsetof(struct CompactionServiceResult, status),
+ OptionType::kCustomizable, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone,
+ [](const ConfigOptions& opts, const std::string& /*name*/,
+ const std::string& value, void* addr) {
+ auto status_obj = static_cast<Status*>(addr);
+ StatusSerializationAdapter adapter;
+ Status s = OptionTypeInfo::ParseType(
+ opts, value, status_adapter_type_info, &adapter);
+ *status_obj = adapter.GetStatus();
+ return s;
+ },
+ [](const ConfigOptions& opts, const std::string& /*name*/,
+ const void* addr, std::string* value) {
+ const auto status_obj = static_cast<const Status*>(addr);
+ StatusSerializationAdapter adapter(*status_obj);
+ std::string result;
+ Status s = OptionTypeInfo::SerializeType(opts, status_adapter_type_info,
+ &adapter, &result);
+ *value = "{" + result + "}";
+ return s;
+ },
+ [](const ConfigOptions& opts, const std::string& /*name*/,
+ const void* addr1, const void* addr2, std::string* mismatch) {
+ const auto status1 = static_cast<const Status*>(addr1);
+ const auto status2 = static_cast<const Status*>(addr2);
+
+      StatusSerializationAdapter adapter1(*status1);
+      StatusSerializationAdapter adapter2(*status2);
+      return OptionTypeInfo::TypesAreEqual(opts, status_adapter_type_info,
+                                           &adapter1, &adapter2, mismatch);
+ }}},
+ {"output_files",
+ OptionTypeInfo::Vector<CompactionServiceOutputFile>(
+ offsetof(struct CompactionServiceResult, output_files),
+ OptionVerificationType::kNormal, OptionTypeFlags::kNone,
+ OptionTypeInfo::Struct("output_files", &cs_output_file_type_info, 0,
+ OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone))},
+ {"output_level",
+ {offsetof(struct CompactionServiceResult, output_level), OptionType::kInt,
+ OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+ {"output_path",
+ {offsetof(struct CompactionServiceResult, output_path),
+ OptionType::kEncodedString, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"num_output_records",
+ {offsetof(struct CompactionServiceResult, num_output_records),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"total_bytes",
+ {offsetof(struct CompactionServiceResult, total_bytes),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"bytes_read",
+ {offsetof(struct CompactionServiceResult, bytes_read),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"bytes_written",
+ {offsetof(struct CompactionServiceResult, bytes_written),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"stats", OptionTypeInfo::Struct(
+ "stats", &compaction_job_stats_type_info,
+ offsetof(struct CompactionServiceResult, stats),
+ OptionVerificationType::kNormal, OptionTypeFlags::kNone)},
+};
+
+Status CompactionServiceInput::Read(const std::string& data_str,
+ CompactionServiceInput* obj) {
+ if (data_str.size() <= sizeof(BinaryFormatVersion)) {
+ return Status::InvalidArgument("Invalid CompactionServiceInput string");
+ }
+ auto format_version = DecodeFixed32(data_str.data());
+ if (format_version == kOptionsString) {
+ ConfigOptions cf;
+ cf.invoke_prepare_options = false;
+ cf.ignore_unknown_options = true;
+ return OptionTypeInfo::ParseType(
+ cf, data_str.substr(sizeof(BinaryFormatVersion)), cs_input_type_info,
+ obj);
+ } else {
+ return Status::NotSupported(
+ "Compaction Service Input data version not supported: " +
+ std::to_string(format_version));
+ }
+}
+
+Status CompactionServiceInput::Write(std::string* output) {
+ char buf[sizeof(BinaryFormatVersion)];
+ EncodeFixed32(buf, kOptionsString);
+ output->append(buf, sizeof(BinaryFormatVersion));
+ ConfigOptions cf;
+ cf.invoke_prepare_options = false;
+ return OptionTypeInfo::SerializeType(cf, cs_input_type_info, this, output);
+}
+
+Status CompactionServiceResult::Read(const std::string& data_str,
+ CompactionServiceResult* obj) {
+ if (data_str.size() <= sizeof(BinaryFormatVersion)) {
+ return Status::InvalidArgument("Invalid CompactionServiceResult string");
+ }
+ auto format_version = DecodeFixed32(data_str.data());
+ if (format_version == kOptionsString) {
+ ConfigOptions cf;
+ cf.invoke_prepare_options = false;
+ cf.ignore_unknown_options = true;
+ return OptionTypeInfo::ParseType(
+ cf, data_str.substr(sizeof(BinaryFormatVersion)), cs_result_type_info,
+ obj);
+ } else {
+ return Status::NotSupported(
+ "Compaction Service Result data version not supported: " +
+ std::to_string(format_version));
+ }
+}
+
+Status CompactionServiceResult::Write(std::string* output) {
+ char buf[sizeof(BinaryFormatVersion)];
+ EncodeFixed32(buf, kOptionsString);
+ output->append(buf, sizeof(BinaryFormatVersion));
+ ConfigOptions cf;
+ cf.invoke_prepare_options = false;
+ return OptionTypeInfo::SerializeType(cf, cs_result_type_info, this, output);
+}
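+
+// Illustrative round trip (exposition only, not part of the original source):
+// Write() prepends the binary format version tag (kOptionsString) and appends
+// the serialized options string, which Read() then parses back:
+//   CompactionServiceResult result;
+//   std::string data;
+//   Status s = result.Write(&data);
+//   CompactionServiceResult decoded;
+//   if (s.ok()) s = CompactionServiceResult::Read(data, &decoded);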
+
+#ifndef NDEBUG
+bool CompactionServiceResult::TEST_Equals(CompactionServiceResult* other) {
+ std::string mismatch;
+ return TEST_Equals(other, &mismatch);
+}
+
+bool CompactionServiceResult::TEST_Equals(CompactionServiceResult* other,
+ std::string* mismatch) {
+ ConfigOptions cf;
+ cf.invoke_prepare_options = false;
+ return OptionTypeInfo::TypesAreEqual(cf, cs_result_type_info, this, other,
+ mismatch);
+}
+
+bool CompactionServiceInput::TEST_Equals(CompactionServiceInput* other) {
+ std::string mismatch;
+ return TEST_Equals(other, &mismatch);
+}
+
+bool CompactionServiceInput::TEST_Equals(CompactionServiceInput* other,
+ std::string* mismatch) {
+ ConfigOptions cf;
+ cf.invoke_prepare_options = false;
+ return OptionTypeInfo::TypesAreEqual(cf, cs_input_type_info, this, other,
+ mismatch);
+}
+#endif // NDEBUG
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/compaction/compaction_service_test.cc b/src/rocksdb/db/compaction/compaction_service_test.cc
new file mode 100644
index 000000000..c475c4e3b
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_service_test.cc
@@ -0,0 +1,966 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "table/unique_id_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class MyTestCompactionService : public CompactionService {
+ public:
+ MyTestCompactionService(
+ std::string db_path, Options& options,
+ std::shared_ptr<Statistics>& statistics,
+ std::vector<std::shared_ptr<EventListener>>& listeners,
+ std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>
+ table_properties_collector_factories)
+ : db_path_(std::move(db_path)),
+ options_(options),
+ statistics_(statistics),
+ start_info_("na", "na", "na", 0, Env::TOTAL),
+ wait_info_("na", "na", "na", 0, Env::TOTAL),
+ listeners_(listeners),
+ table_properties_collector_factories_(
+ std::move(table_properties_collector_factories)) {}
+
+ static const char* kClassName() { return "MyTestCompactionService"; }
+
+ const char* Name() const override { return kClassName(); }
+
+ CompactionServiceJobStatus StartV2(
+ const CompactionServiceJobInfo& info,
+ const std::string& compaction_service_input) override {
+ InstrumentedMutexLock l(&mutex_);
+ start_info_ = info;
+ assert(info.db_name == db_path_);
+ jobs_.emplace(info.job_id, compaction_service_input);
+ CompactionServiceJobStatus s = CompactionServiceJobStatus::kSuccess;
+ if (is_override_start_status_) {
+ return override_start_status_;
+ }
+ return s;
+ }
+
+ CompactionServiceJobStatus WaitForCompleteV2(
+ const CompactionServiceJobInfo& info,
+ std::string* compaction_service_result) override {
+ std::string compaction_input;
+ assert(info.db_name == db_path_);
+ {
+ InstrumentedMutexLock l(&mutex_);
+ wait_info_ = info;
+ auto i = jobs_.find(info.job_id);
+ if (i == jobs_.end()) {
+ return CompactionServiceJobStatus::kFailure;
+ }
+ compaction_input = std::move(i->second);
+ jobs_.erase(i);
+ }
+
+ if (is_override_wait_status_) {
+ return override_wait_status_;
+ }
+
+ CompactionServiceOptionsOverride options_override;
+ options_override.env = options_.env;
+ options_override.file_checksum_gen_factory =
+ options_.file_checksum_gen_factory;
+ options_override.comparator = options_.comparator;
+ options_override.merge_operator = options_.merge_operator;
+ options_override.compaction_filter = options_.compaction_filter;
+ options_override.compaction_filter_factory =
+ options_.compaction_filter_factory;
+ options_override.prefix_extractor = options_.prefix_extractor;
+ options_override.table_factory = options_.table_factory;
+ options_override.sst_partitioner_factory = options_.sst_partitioner_factory;
+ options_override.statistics = statistics_;
+ if (!listeners_.empty()) {
+ options_override.listeners = listeners_;
+ }
+
+ if (!table_properties_collector_factories_.empty()) {
+ options_override.table_properties_collector_factories =
+ table_properties_collector_factories_;
+ }
+
+ OpenAndCompactOptions options;
+ options.canceled = &canceled_;
+
+ Status s = DB::OpenAndCompact(
+ options, db_path_, db_path_ + "/" + std::to_string(info.job_id),
+ compaction_input, compaction_service_result, options_override);
+ if (is_override_wait_result_) {
+ *compaction_service_result = override_wait_result_;
+ }
+ compaction_num_.fetch_add(1);
+ if (s.ok()) {
+ return CompactionServiceJobStatus::kSuccess;
+ } else {
+ return CompactionServiceJobStatus::kFailure;
+ }
+ }
+
+ int GetCompactionNum() { return compaction_num_.load(); }
+
+ CompactionServiceJobInfo GetCompactionInfoForStart() { return start_info_; }
+ CompactionServiceJobInfo GetCompactionInfoForWait() { return wait_info_; }
+
+ void OverrideStartStatus(CompactionServiceJobStatus s) {
+ is_override_start_status_ = true;
+ override_start_status_ = s;
+ }
+
+ void OverrideWaitStatus(CompactionServiceJobStatus s) {
+ is_override_wait_status_ = true;
+ override_wait_status_ = s;
+ }
+
+ void OverrideWaitResult(std::string str) {
+ is_override_wait_result_ = true;
+ override_wait_result_ = std::move(str);
+ }
+
+ void ResetOverride() {
+ is_override_wait_result_ = false;
+ is_override_start_status_ = false;
+ is_override_wait_status_ = false;
+ }
+
+ void SetCanceled(bool canceled) { canceled_ = canceled; }
+
+ private:
+ InstrumentedMutex mutex_;
+ std::atomic_int compaction_num_{0};
+ std::map<uint64_t, std::string> jobs_;
+ const std::string db_path_;
+ Options options_;
+ std::shared_ptr<Statistics> statistics_;
+ CompactionServiceJobInfo start_info_;
+ CompactionServiceJobInfo wait_info_;
+ bool is_override_start_status_ = false;
+ CompactionServiceJobStatus override_start_status_ =
+ CompactionServiceJobStatus::kFailure;
+ bool is_override_wait_status_ = false;
+ CompactionServiceJobStatus override_wait_status_ =
+ CompactionServiceJobStatus::kFailure;
+ bool is_override_wait_result_ = false;
+ std::string override_wait_result_;
+ std::vector<std::shared_ptr<EventListener>> listeners_;
+ std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>
+ table_properties_collector_factories_;
+ std::atomic_bool canceled_{false};
+};
+
+class CompactionServiceTest : public DBTestBase {
+ public:
+ explicit CompactionServiceTest()
+ : DBTestBase("compaction_service_test", true) {}
+
+ protected:
+ void ReopenWithCompactionService(Options* options) {
+ options->env = env_;
+ primary_statistics_ = CreateDBStatistics();
+ options->statistics = primary_statistics_;
+ compactor_statistics_ = CreateDBStatistics();
+
+ compaction_service_ = std::make_shared<MyTestCompactionService>(
+ dbname_, *options, compactor_statistics_, remote_listeners,
+ remote_table_properties_collector_factories);
+ options->compaction_service = compaction_service_;
+ DestroyAndReopen(*options);
+ }
+
+ Statistics* GetCompactorStatistics() { return compactor_statistics_.get(); }
+
+ Statistics* GetPrimaryStatistics() { return primary_statistics_.get(); }
+
+ MyTestCompactionService* GetCompactionService() {
+ CompactionService* cs = compaction_service_.get();
+ return static_cast_with_check<MyTestCompactionService>(cs);
+ }
+
+ void GenerateTestData() {
+ // Generate 20 files @ L2
+ for (int i = 0; i < 20; i++) {
+ for (int j = 0; j < 10; j++) {
+ int key_id = i * 10 + j;
+ ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id)));
+ }
+ ASSERT_OK(Flush());
+ }
+ MoveFilesToLevel(2);
+
+    // Generate 10 files @ L1 that overlap with all 20 files @ L2
+ for (int i = 0; i < 10; i++) {
+ for (int j = 0; j < 10; j++) {
+ int key_id = i * 20 + j * 2;
+ ASSERT_OK(Put(Key(key_id), "value_new" + std::to_string(key_id)));
+ }
+ ASSERT_OK(Flush());
+ }
+ MoveFilesToLevel(1);
+ ASSERT_EQ(FilesPerLevel(), "0,10,20");
+ }
+
+ void VerifyTestData() {
+ for (int i = 0; i < 200; i++) {
+ auto result = Get(Key(i));
+ if (i % 2) {
+ ASSERT_EQ(result, "value" + std::to_string(i));
+ } else {
+ ASSERT_EQ(result, "value_new" + std::to_string(i));
+ }
+ }
+ }
+
+ std::vector<std::shared_ptr<EventListener>> remote_listeners;
+ std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>
+ remote_table_properties_collector_factories;
+
+ private:
+ std::shared_ptr<Statistics> compactor_statistics_;
+ std::shared_ptr<Statistics> primary_statistics_;
+ std::shared_ptr<CompactionService> compaction_service_;
+};
+
+TEST_F(CompactionServiceTest, BasicCompactions) {
+ Options options = CurrentOptions();
+ ReopenWithCompactionService(&options);
+
+ Statistics* primary_statistics = GetPrimaryStatistics();
+ Statistics* compactor_statistics = GetCompactorStatistics();
+
+ for (int i = 0; i < 20; i++) {
+ for (int j = 0; j < 10; j++) {
+ int key_id = i * 10 + j;
+ ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ for (int i = 0; i < 10; i++) {
+ for (int j = 0; j < 10; j++) {
+ int key_id = i * 20 + j * 2;
+ ASSERT_OK(Put(Key(key_id), "value_new" + std::to_string(key_id)));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ // verify result
+ for (int i = 0; i < 200; i++) {
+ auto result = Get(Key(i));
+ if (i % 2) {
+ ASSERT_EQ(result, "value" + std::to_string(i));
+ } else {
+ ASSERT_EQ(result, "value_new" + std::to_string(i));
+ }
+ }
+ auto my_cs = GetCompactionService();
+ ASSERT_GE(my_cs->GetCompactionNum(), 1);
+
+  // make sure the compaction statistics are only recorded on the remote side
+ ASSERT_GE(compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES), 1);
+ ASSERT_GE(compactor_statistics->getTickerCount(COMPACT_READ_BYTES), 1);
+ ASSERT_EQ(primary_statistics->getTickerCount(COMPACT_WRITE_BYTES), 0);
+  // even with remote compaction, the primary host still needs to read SST
+  // files for `verify_table()`.
+ ASSERT_GE(primary_statistics->getTickerCount(COMPACT_READ_BYTES), 1);
+ // all the compaction write happens on the remote side
+ ASSERT_EQ(primary_statistics->getTickerCount(REMOTE_COMPACT_WRITE_BYTES),
+ compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES));
+ ASSERT_GE(primary_statistics->getTickerCount(REMOTE_COMPACT_READ_BYTES), 1);
+ ASSERT_GT(primary_statistics->getTickerCount(COMPACT_READ_BYTES),
+ primary_statistics->getTickerCount(REMOTE_COMPACT_READ_BYTES));
+  // the compactor is already the remote side, so it records no remote stats
+ ASSERT_EQ(compactor_statistics->getTickerCount(REMOTE_COMPACT_READ_BYTES), 0);
+ ASSERT_EQ(compactor_statistics->getTickerCount(REMOTE_COMPACT_WRITE_BYTES),
+ 0);
+
+ // Test failed compaction
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImplSecondary::CompactWithoutInstallation::End", [&](void* status) {
+ // override job status
+ auto s = static_cast<Status*>(status);
+ *s = Status::Aborted("MyTestCompactionService failed to compact!");
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Status s;
+ for (int i = 0; i < 10; i++) {
+ for (int j = 0; j < 10; j++) {
+ int key_id = i * 20 + j * 2;
+ s = Put(Key(key_id), "value_new" + std::to_string(key_id));
+ if (s.IsAborted()) {
+ break;
+ }
+ }
+ if (s.IsAborted()) {
+ break;
+ }
+ s = Flush();
+ if (s.IsAborted()) {
+ break;
+ }
+ s = dbfull()->TEST_WaitForCompact();
+ if (s.IsAborted()) {
+ break;
+ }
+ }
+ ASSERT_TRUE(s.IsAborted());
+
+ // Test re-open and successful unique id verification
+ std::atomic_int verify_passed{0};
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTable::Open::PassedVerifyUniqueId", [&](void* arg) {
+        // verify that the unique id passed verification
+ auto id = static_cast<UniqueId64x2*>(arg);
+ assert(*id != kNullUniqueId64x2);
+ verify_passed++;
+ });
+ Reopen(options);
+ ASSERT_GT(verify_passed, 0);
+ Close();
+}
+
+TEST_F(CompactionServiceTest, ManualCompaction) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ ReopenWithCompactionService(&options);
+ GenerateTestData();
+
+ auto my_cs = GetCompactionService();
+
+ std::string start_str = Key(15);
+ std::string end_str = Key(45);
+ Slice start(start_str);
+ Slice end(end_str);
+ uint64_t comp_num = my_cs->GetCompactionNum();
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &end));
+ ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1);
+ VerifyTestData();
+
+ start_str = Key(120);
+ start = start_str;
+ comp_num = my_cs->GetCompactionNum();
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, nullptr));
+ ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1);
+ VerifyTestData();
+
+ end_str = Key(92);
+ end = end_str;
+ comp_num = my_cs->GetCompactionNum();
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, &end));
+ ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1);
+ VerifyTestData();
+
+ comp_num = my_cs->GetCompactionNum();
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1);
+ VerifyTestData();
+}
+
+TEST_F(CompactionServiceTest, CancelCompactionOnRemoteSide) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ ReopenWithCompactionService(&options);
+ GenerateTestData();
+
+ auto my_cs = GetCompactionService();
+
+ std::string start_str = Key(15);
+ std::string end_str = Key(45);
+ Slice start(start_str);
+ Slice end(end_str);
+ uint64_t comp_num = my_cs->GetCompactionNum();
+
+ // Test cancel compaction at the beginning
+ my_cs->SetCanceled(true);
+ auto s = db_->CompactRange(CompactRangeOptions(), &start, &end);
+ ASSERT_TRUE(s.IsIncomplete());
+ // compaction number is not increased
+ ASSERT_GE(my_cs->GetCompactionNum(), comp_num);
+ VerifyTestData();
+
+ // Test cancel compaction in progress
+ ReopenWithCompactionService(&options);
+ GenerateTestData();
+ my_cs = GetCompactionService();
+ my_cs->SetCanceled(false);
+
+ std::atomic_bool cancel_issued{false};
+ SyncPoint::GetInstance()->SetCallBack("CompactionJob::Run():Inprogress",
+ [&](void* /*arg*/) {
+ cancel_issued = true;
+ my_cs->SetCanceled(true);
+ });
+
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ s = db_->CompactRange(CompactRangeOptions(), &start, &end);
+ ASSERT_TRUE(s.IsIncomplete());
+ ASSERT_TRUE(cancel_issued);
+ // compaction number is not increased
+ ASSERT_GE(my_cs->GetCompactionNum(), comp_num);
+ VerifyTestData();
+}
+
+TEST_F(CompactionServiceTest, FailedToStart) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ ReopenWithCompactionService(&options);
+
+ GenerateTestData();
+
+ auto my_cs = GetCompactionService();
+ my_cs->OverrideStartStatus(CompactionServiceJobStatus::kFailure);
+
+ std::string start_str = Key(15);
+ std::string end_str = Key(45);
+ Slice start(start_str);
+ Slice end(end_str);
+ Status s = db_->CompactRange(CompactRangeOptions(), &start, &end);
+ ASSERT_TRUE(s.IsIncomplete());
+}
+
+TEST_F(CompactionServiceTest, InvalidResult) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ ReopenWithCompactionService(&options);
+
+ GenerateTestData();
+
+ auto my_cs = GetCompactionService();
+ my_cs->OverrideWaitResult("Invalid Str");
+
+ std::string start_str = Key(15);
+ std::string end_str = Key(45);
+ Slice start(start_str);
+ Slice end(end_str);
+ Status s = db_->CompactRange(CompactRangeOptions(), &start, &end);
+ ASSERT_FALSE(s.ok());
+}
+
+TEST_F(CompactionServiceTest, SubCompaction) {
+ Options options = CurrentOptions();
+ options.max_subcompactions = 10;
+ options.target_file_size_base = 1 << 10; // 1KB
+ options.disable_auto_compactions = true;
+ ReopenWithCompactionService(&options);
+
+ GenerateTestData();
+ VerifyTestData();
+
+ auto my_cs = GetCompactionService();
+ int compaction_num_before = my_cs->GetCompactionNum();
+
+ auto cro = CompactRangeOptions();
+ cro.max_subcompactions = 10;
+ Status s = db_->CompactRange(cro, nullptr, nullptr);
+ ASSERT_OK(s);
+ VerifyTestData();
+ int compaction_num = my_cs->GetCompactionNum() - compaction_num_before;
+  // make sure there are sub-compactions by checking the compaction count
+ ASSERT_GE(compaction_num, 2);
+}
+
+class PartialDeleteCompactionFilter : public CompactionFilter {
+ public:
+ CompactionFilter::Decision FilterV2(
+ int /*level*/, const Slice& key, ValueType /*value_type*/,
+ const Slice& /*existing_value*/, std::string* /*new_value*/,
+ std::string* /*skip_until*/) const override {
+ int i = std::stoi(key.ToString().substr(3));
+ if (i > 5 && i <= 105) {
+ return CompactionFilter::Decision::kRemove;
+ }
+ return CompactionFilter::Decision::kKeep;
+ }
+
+ const char* Name() const override { return "PartialDeleteCompactionFilter"; }
+};
+
+TEST_F(CompactionServiceTest, CompactionFilter) {
+ Options options = CurrentOptions();
+ std::unique_ptr<CompactionFilter> delete_comp_filter(
+ new PartialDeleteCompactionFilter());
+ options.compaction_filter = delete_comp_filter.get();
+ ReopenWithCompactionService(&options);
+
+ for (int i = 0; i < 20; i++) {
+ for (int j = 0; j < 10; j++) {
+ int key_id = i * 10 + j;
+ ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ for (int i = 0; i < 10; i++) {
+ for (int j = 0; j < 10; j++) {
+ int key_id = i * 20 + j * 2;
+ ASSERT_OK(Put(Key(key_id), "value_new" + std::to_string(key_id)));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ // verify result
+ for (int i = 0; i < 200; i++) {
+ auto result = Get(Key(i));
+ if (i > 5 && i <= 105) {
+ ASSERT_EQ(result, "NOT_FOUND");
+ } else if (i % 2) {
+ ASSERT_EQ(result, "value" + std::to_string(i));
+ } else {
+ ASSERT_EQ(result, "value_new" + std::to_string(i));
+ }
+ }
+ auto my_cs = GetCompactionService();
+ ASSERT_GE(my_cs->GetCompactionNum(), 1);
+}
+
+TEST_F(CompactionServiceTest, Snapshot) {
+ Options options = CurrentOptions();
+ ReopenWithCompactionService(&options);
+
+ ASSERT_OK(Put(Key(1), "value1"));
+ ASSERT_OK(Put(Key(2), "value1"));
+ const Snapshot* s1 = db_->GetSnapshot();
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(Key(1), "value2"));
+ ASSERT_OK(Put(Key(3), "value2"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ auto my_cs = GetCompactionService();
+ ASSERT_GE(my_cs->GetCompactionNum(), 1);
+ ASSERT_EQ("value1", Get(Key(1), s1));
+ ASSERT_EQ("value2", Get(Key(1)));
+ db_->ReleaseSnapshot(s1);
+}
+
+TEST_F(CompactionServiceTest, ConcurrentCompaction) {
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 100;
+ options.max_background_jobs = 20;
+ ReopenWithCompactionService(&options);
+ GenerateTestData();
+
+ ColumnFamilyMetaData meta;
+ db_->GetColumnFamilyMetaData(&meta);
+
+ std::vector<std::thread> threads;
+ for (const auto& file : meta.levels[1].files) {
+ threads.emplace_back(std::thread([&]() {
+ std::string fname = file.db_path + "/" + file.name;
+ ASSERT_OK(db_->CompactFiles(CompactionOptions(), {fname}, 2));
+ }));
+ }
+
+ for (auto& thread : threads) {
+ thread.join();
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ // verify result
+ for (int i = 0; i < 200; i++) {
+ auto result = Get(Key(i));
+ if (i % 2) {
+ ASSERT_EQ(result, "value" + std::to_string(i));
+ } else {
+ ASSERT_EQ(result, "value_new" + std::to_string(i));
+ }
+ }
+ auto my_cs = GetCompactionService();
+ ASSERT_EQ(my_cs->GetCompactionNum(), 10);
+ ASSERT_EQ(FilesPerLevel(), "0,0,10");
+}
+
+TEST_F(CompactionServiceTest, CompactionInfo) {
+ Options options = CurrentOptions();
+ ReopenWithCompactionService(&options);
+
+ for (int i = 0; i < 20; i++) {
+ for (int j = 0; j < 10; j++) {
+ int key_id = i * 10 + j;
+ ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ for (int i = 0; i < 10; i++) {
+ for (int j = 0; j < 10; j++) {
+ int key_id = i * 20 + j * 2;
+ ASSERT_OK(Put(Key(key_id), "value_new" + std::to_string(key_id)));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ auto my_cs =
+ static_cast_with_check<MyTestCompactionService>(GetCompactionService());
+ uint64_t comp_num = my_cs->GetCompactionNum();
+ ASSERT_GE(comp_num, 1);
+
+ CompactionServiceJobInfo info = my_cs->GetCompactionInfoForStart();
+ ASSERT_EQ(dbname_, info.db_name);
+ std::string db_id, db_session_id;
+ ASSERT_OK(db_->GetDbIdentity(db_id));
+ ASSERT_EQ(db_id, info.db_id);
+ ASSERT_OK(db_->GetDbSessionId(db_session_id));
+ ASSERT_EQ(db_session_id, info.db_session_id);
+ ASSERT_EQ(Env::LOW, info.priority);
+ info = my_cs->GetCompactionInfoForWait();
+ ASSERT_EQ(dbname_, info.db_name);
+ ASSERT_EQ(db_id, info.db_id);
+ ASSERT_EQ(db_session_id, info.db_session_id);
+ ASSERT_EQ(Env::LOW, info.priority);
+
+ // Test priority USER
+ ColumnFamilyMetaData meta;
+ db_->GetColumnFamilyMetaData(&meta);
+ SstFileMetaData file = meta.levels[1].files[0];
+ ASSERT_OK(db_->CompactFiles(CompactionOptions(),
+ {file.db_path + "/" + file.name}, 2));
+ info = my_cs->GetCompactionInfoForStart();
+ ASSERT_EQ(Env::USER, info.priority);
+ info = my_cs->GetCompactionInfoForWait();
+ ASSERT_EQ(Env::USER, info.priority);
+
+ // Test priority BOTTOM
+ env_->SetBackgroundThreads(1, Env::BOTTOM);
+ options.num_levels = 2;
+ ReopenWithCompactionService(&options);
+ my_cs =
+ static_cast_with_check<MyTestCompactionService>(GetCompactionService());
+
+ for (int i = 0; i < 20; i++) {
+ for (int j = 0; j < 10; j++) {
+ int key_id = i * 10 + j;
+ ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ for (int i = 0; i < 4; i++) {
+ for (int j = 0; j < 10; j++) {
+ int key_id = i * 20 + j * 2;
+ ASSERT_OK(Put(Key(key_id), "value_new" + std::to_string(key_id)));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ info = my_cs->GetCompactionInfoForStart();
+ ASSERT_EQ(Env::BOTTOM, info.priority);
+ info = my_cs->GetCompactionInfoForWait();
+ ASSERT_EQ(Env::BOTTOM, info.priority);
+}
+
+TEST_F(CompactionServiceTest, FallbackLocalAuto) {
+ Options options = CurrentOptions();
+ ReopenWithCompactionService(&options);
+
+ auto my_cs = GetCompactionService();
+ Statistics* compactor_statistics = GetCompactorStatistics();
+ Statistics* primary_statistics = GetPrimaryStatistics();
+ uint64_t compactor_write_bytes =
+ compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES);
+ uint64_t primary_write_bytes =
+ primary_statistics->getTickerCount(COMPACT_WRITE_BYTES);
+
+ my_cs->OverrideStartStatus(CompactionServiceJobStatus::kUseLocal);
+
+ for (int i = 0; i < 20; i++) {
+ for (int j = 0; j < 10; j++) {
+ int key_id = i * 10 + j;
+ ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ for (int i = 0; i < 10; i++) {
+ for (int j = 0; j < 10; j++) {
+ int key_id = i * 20 + j * 2;
+ ASSERT_OK(Put(Key(key_id), "value_new" + std::to_string(key_id)));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ // verify result
+ for (int i = 0; i < 200; i++) {
+ auto result = Get(Key(i));
+ if (i % 2) {
+ ASSERT_EQ(result, "value" + std::to_string(i));
+ } else {
+ ASSERT_EQ(result, "value_new" + std::to_string(i));
+ }
+ }
+
+ ASSERT_EQ(my_cs->GetCompactionNum(), 0);
+
+  // make sure the compaction statistics are only recorded on the local side
+ ASSERT_EQ(compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES),
+ compactor_write_bytes);
+ ASSERT_GT(primary_statistics->getTickerCount(COMPACT_WRITE_BYTES),
+ primary_write_bytes);
+ ASSERT_EQ(primary_statistics->getTickerCount(REMOTE_COMPACT_READ_BYTES), 0);
+ ASSERT_EQ(primary_statistics->getTickerCount(REMOTE_COMPACT_WRITE_BYTES), 0);
+}
+
+TEST_F(CompactionServiceTest, FallbackLocalManual) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ ReopenWithCompactionService(&options);
+
+ GenerateTestData();
+ VerifyTestData();
+
+ auto my_cs = GetCompactionService();
+ Statistics* compactor_statistics = GetCompactorStatistics();
+ Statistics* primary_statistics = GetPrimaryStatistics();
+ uint64_t compactor_write_bytes =
+ compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES);
+ uint64_t primary_write_bytes =
+ primary_statistics->getTickerCount(COMPACT_WRITE_BYTES);
+
+ // re-enable remote compaction
+ my_cs->ResetOverride();
+ std::string start_str = Key(15);
+ std::string end_str = Key(45);
+ Slice start(start_str);
+ Slice end(end_str);
+ uint64_t comp_num = my_cs->GetCompactionNum();
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &end));
+ ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1);
+  // make sure the compaction statistics are only recorded on the remote side
+ ASSERT_GT(compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES),
+ compactor_write_bytes);
+ ASSERT_EQ(primary_statistics->getTickerCount(REMOTE_COMPACT_WRITE_BYTES),
+ compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES));
+ ASSERT_EQ(primary_statistics->getTickerCount(COMPACT_WRITE_BYTES),
+ primary_write_bytes);
+
+  // fall back to running locally again, this time via the WaitForComplete API
+ my_cs->OverrideWaitStatus(CompactionServiceJobStatus::kUseLocal);
+ start_str = Key(120);
+ start = start_str;
+ comp_num = my_cs->GetCompactionNum();
+ compactor_write_bytes =
+ compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES);
+ primary_write_bytes = primary_statistics->getTickerCount(COMPACT_WRITE_BYTES);
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, nullptr));
+ ASSERT_EQ(my_cs->GetCompactionNum(),
+ comp_num); // no remote compaction is run
+  // make sure the compaction statistics are only recorded on the local side
+ ASSERT_EQ(compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES),
+ compactor_write_bytes);
+ ASSERT_GT(primary_statistics->getTickerCount(COMPACT_WRITE_BYTES),
+ primary_write_bytes);
+ ASSERT_EQ(primary_statistics->getTickerCount(REMOTE_COMPACT_WRITE_BYTES),
+ compactor_write_bytes);
+
+ // verify result after 2 manual compactions
+ VerifyTestData();
+}
+
+TEST_F(CompactionServiceTest, RemoteEventListener) {
+ class RemoteEventListenerTest : public EventListener {
+ public:
+ const char* Name() const override { return "RemoteEventListenerTest"; }
+
+ void OnSubcompactionBegin(const SubcompactionJobInfo& info) override {
+ auto result = on_going_compactions.emplace(info.job_id);
+ ASSERT_TRUE(result.second); // make sure there's no duplication
+ compaction_num++;
+ EventListener::OnSubcompactionBegin(info);
+ }
+ void OnSubcompactionCompleted(const SubcompactionJobInfo& info) override {
+ auto num = on_going_compactions.erase(info.job_id);
+ ASSERT_TRUE(num == 1); // make sure the compaction id exists
+ EventListener::OnSubcompactionCompleted(info);
+ }
+ void OnTableFileCreated(const TableFileCreationInfo& info) override {
+ ASSERT_EQ(on_going_compactions.count(info.job_id), 1);
+ file_created++;
+ EventListener::OnTableFileCreated(info);
+ }
+ void OnTableFileCreationStarted(
+ const TableFileCreationBriefInfo& info) override {
+ ASSERT_EQ(on_going_compactions.count(info.job_id), 1);
+ file_creation_started++;
+ EventListener::OnTableFileCreationStarted(info);
+ }
+
+ bool ShouldBeNotifiedOnFileIO() override {
+ file_io_notified++;
+ return EventListener::ShouldBeNotifiedOnFileIO();
+ }
+
+ std::atomic_uint64_t file_io_notified{0};
+ std::atomic_uint64_t file_creation_started{0};
+ std::atomic_uint64_t file_created{0};
+
+ std::set<int> on_going_compactions; // store the job_id
+ std::atomic_uint64_t compaction_num{0};
+ };
+
+ auto listener = new RemoteEventListenerTest();
+ remote_listeners.emplace_back(listener);
+
+ Options options = CurrentOptions();
+ ReopenWithCompactionService(&options);
+
+ for (int i = 0; i < 20; i++) {
+ for (int j = 0; j < 10; j++) {
+ int key_id = i * 10 + j;
+ ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ for (int i = 0; i < 10; i++) {
+ for (int j = 0; j < 10; j++) {
+ int key_id = i * 20 + j * 2;
+ ASSERT_OK(Put(Key(key_id), "value_new" + std::to_string(key_id)));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ // check the events are triggered
+ ASSERT_TRUE(listener->file_io_notified > 0);
+ ASSERT_TRUE(listener->file_creation_started > 0);
+ ASSERT_TRUE(listener->file_created > 0);
+ ASSERT_TRUE(listener->compaction_num > 0);
+ ASSERT_TRUE(listener->on_going_compactions.empty());
+
+ // verify result
+ for (int i = 0; i < 200; i++) {
+ auto result = Get(Key(i));
+ if (i % 2) {
+ ASSERT_EQ(result, "value" + std::to_string(i));
+ } else {
+ ASSERT_EQ(result, "value_new" + std::to_string(i));
+ }
+ }
+}
+
+TEST_F(CompactionServiceTest, TablePropertiesCollector) {
+ const static std::string kUserPropertyName = "TestCount";
+
+ class TablePropertiesCollectorTest : public TablePropertiesCollector {
+ public:
+ Status Finish(UserCollectedProperties* properties) override {
+ *properties = UserCollectedProperties{
+ {kUserPropertyName, std::to_string(count_)},
+ };
+ return Status::OK();
+ }
+
+ UserCollectedProperties GetReadableProperties() const override {
+ return UserCollectedProperties();
+ }
+
+ const char* Name() const override { return "TablePropertiesCollectorTest"; }
+
+ Status AddUserKey(const Slice& /*user_key*/, const Slice& /*value*/,
+ EntryType /*type*/, SequenceNumber /*seq*/,
+ uint64_t /*file_size*/) override {
+ count_++;
+ return Status::OK();
+ }
+
+ private:
+ uint32_t count_ = 0;
+ };
+
+ class TablePropertiesCollectorFactoryTest
+ : public TablePropertiesCollectorFactory {
+ public:
+ TablePropertiesCollector* CreateTablePropertiesCollector(
+ TablePropertiesCollectorFactory::Context /*context*/) override {
+ return new TablePropertiesCollectorTest();
+ }
+
+ const char* Name() const override {
+ return "TablePropertiesCollectorFactoryTest";
+ }
+ };
+
+ auto factory = new TablePropertiesCollectorFactoryTest();
+ remote_table_properties_collector_factories.emplace_back(factory);
+
+ const int kNumSst = 3;
+ const int kLevel0Trigger = 4;
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = kLevel0Trigger;
+ ReopenWithCompactionService(&options);
+
+  // generate a few SSTs locally, which should not have the user property
+ for (int i = 0; i < kNumSst; i++) {
+ for (int j = 0; j < 100; j++) {
+ ASSERT_OK(Put(Key(i * 10 + j), "value"));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ TablePropertiesCollection fname_to_props;
+ ASSERT_OK(db_->GetPropertiesOfAllTables(&fname_to_props));
+ for (const auto& file_props : fname_to_props) {
+ auto properties = file_props.second->user_collected_properties;
+ auto it = properties.find(kUserPropertyName);
+ ASSERT_EQ(it, properties.end());
+ }
+
+ // trigger compaction
+ for (int i = kNumSst; i < kLevel0Trigger; i++) {
+ for (int j = 0; j < 100; j++) {
+ ASSERT_OK(Put(Key(i * 10 + j), "value"));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+ ASSERT_OK(db_->GetPropertiesOfAllTables(&fname_to_props));
+
+ bool has_user_property = false;
+ for (const auto& file_props : fname_to_props) {
+ auto properties = file_props.second->user_collected_properties;
+ auto it = properties.find(kUserPropertyName);
+ if (it != properties.end()) {
+ has_user_property = true;
+ ASSERT_GT(std::stoi(it->second), 0);
+ }
+ }
+ ASSERT_TRUE(has_user_property);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr,
+ "SKIPPED as CompactionService is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/db/compaction/compaction_state.cc b/src/rocksdb/db/compaction/compaction_state.cc
new file mode 100644
index 000000000..ee4b0c189
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_state.cc
@@ -0,0 +1,46 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction_state.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Slice CompactionState::SmallestUserKey() {
+ for (const auto& sub_compact_state : sub_compact_states) {
+ Slice smallest = sub_compact_state.SmallestUserKey();
+ if (!smallest.empty()) {
+ return smallest;
+ }
+ }
+ // If there is no finished output, return an empty slice.
+ return Slice{nullptr, 0};
+}
+
+Slice CompactionState::LargestUserKey() {
+ for (auto it = sub_compact_states.rbegin(); it < sub_compact_states.rend();
+ ++it) {
+ Slice largest = it->LargestUserKey();
+ if (!largest.empty()) {
+ return largest;
+ }
+ }
+ // If there is no finished output, return an empty slice.
+ return Slice{nullptr, 0};
+}
+
+void CompactionState::AggregateCompactionStats(
+ InternalStats::CompactionStatsFull& compaction_stats,
+ CompactionJobStats& compaction_job_stats) {
+ for (const auto& sc : sub_compact_states) {
+ sc.AggregateCompactionStats(compaction_stats);
+ compaction_job_stats.Add(sc.compaction_job_stats);
+ }
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_state.h b/src/rocksdb/db/compaction/compaction_state.h
new file mode 100644
index 000000000..cc5b66c68
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_state.h
@@ -0,0 +1,42 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "db/compaction/compaction.h"
+#include "db/compaction/subcompaction_state.h"
+#include "db/internal_stats.h"
+
+// Data structures used by compaction_job and compaction_service_job, holding
+// the list of sub_compact_states and the aggregated information for the
+// compaction.
+namespace ROCKSDB_NAMESPACE {
+
+// Maintains state for the entire compaction
+class CompactionState {
+ public:
+ Compaction* const compaction;
+
+ // REQUIRED: subcompaction states are stored in order of increasing key-range
+ std::vector<SubcompactionState> sub_compact_states;
+ Status status;
+
+ void AggregateCompactionStats(
+ InternalStats::CompactionStatsFull& compaction_stats,
+ CompactionJobStats& compaction_job_stats);
+
+ explicit CompactionState(Compaction* c) : compaction(c) {}
+
+ Slice SmallestUserKey();
+
+ Slice LargestUserKey();
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/file_pri.h b/src/rocksdb/db/compaction/file_pri.h
new file mode 100644
index 000000000..82dddcf93
--- /dev/null
+++ b/src/rocksdb/db/compaction/file_pri.h
@@ -0,0 +1,92 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#pragma once
+#include <algorithm>
+
+#include "db/version_edit.h"
+
+namespace ROCKSDB_NAMESPACE {
+// We boost files that are closer to the TTL limit. This boosting could be
+// done through FileMetaData.compensated_file_size, but that compensated size
+// is widely used as something similar to file size, so dramatically boosting
+// its value might cause unintended consequences.
+//
+// This boosting algorithm could get very fancy, but here we use a simple
+// formula which satisfies:
+// (1) Different levels are triggered slightly differently to avoid
+//     too many cascading cases
+// (2) Files in the same level get boosted more as they get closer to the TTL.
+//
+// Don't do any boosting before the TTL has passed its halfway point. This is
+// to keep write amplification lower for most cases. All levels should be
+// fully boosted by the time the total TTL compaction threshold triggers.
+// Differentiate the boosting ranges of adjacent levels by a factor of 1/2.
+// This makes the range for each level grow exponentially. We could instead
+// make them equal, or go even fancier. We can adjust this after we observe
+// the behavior in production.
+// The thresholds at which boosting starts:
+// +------------------------------------------------------------------+
+// ^            ^                  ^                ^
+// Age 0 ...    |                  |                second last level threshold
+//              |                  third last level
+//              fourth last level
+//
+// The boost is arbitrarily set to 0 when a file reaches age boost_age_start
+// and grows linearly from there. The ratio is arbitrarily chosen so that when
+// the next level starts to boost, the previous level's boosting amount is 16.
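+//
+// Illustrative example (arbitrary numbers, not from the original source):
+// with ttl = 3200, num_non_empty_levels = 5 and level = 2, the constructor
+// below computes all_boost_start_age = 1600, all_boost_age_range =
+// 3100 - 1600 = 1500 and boost_age_range = 1500 >> 2 = 375, so boosting
+// starts at age 1600 + 375 = 1975 with boost_step_ = 375 / 16 = 23. A file
+// aged 2350 then gets a boost score of (2350 - 1975) / 23 + 1 = 17, while a
+// file younger than 1975 keeps the neutral score of 1.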
+class FileTtlBooster {
+ public:
+ FileTtlBooster(uint64_t current_time, uint64_t ttl, int num_non_empty_levels,
+ int level)
+ : current_time_(current_time) {
+ if (ttl == 0 || level == 0 || level >= num_non_empty_levels - 1) {
+ enabled_ = false;
+ boost_age_start_ = 0;
+ boost_step_ = 1;
+ } else {
+ enabled_ = true;
+ uint64_t all_boost_start_age = ttl / 2;
+ uint64_t all_boost_age_range = (ttl / 32) * 31 - all_boost_start_age;
+ uint64_t boost_age_range =
+ all_boost_age_range >> (num_non_empty_levels - level - 1);
+ boost_age_start_ = all_boost_start_age + boost_age_range;
+ const uint64_t kBoostRatio = 16;
+      // Prevent a 0 value to avoid a divide-by-zero error.
+ boost_step_ = std::max(boost_age_range / kBoostRatio, uint64_t{1});
+ }
+ }
+
+ uint64_t GetBoostScore(FileMetaData* f) {
+ if (!enabled_) {
+ return 1;
+ }
+ uint64_t oldest_ancester_time = f->TryGetOldestAncesterTime();
+ if (oldest_ancester_time >= current_time_) {
+ return 1;
+ }
+ uint64_t age = current_time_ - oldest_ancester_time;
+ if (age > boost_age_start_) {
+      // Use integers just for convenience.
+      // We could make all file_to_order values double if we wanted.
+      // Technically this can overflow if users override timing and
+      // give a very high current time. Ignore that case for simplicity.
+      // Boosting is an addition to the current value, so +1. This effectively
+      // makes boosting kick in after the first boost_step_ is reached.
+ return (age - boost_age_start_) / boost_step_ + 1;
+ }
+ return 1;
+ }
+
+ private:
+ bool enabled_;
+ uint64_t current_time_;
+ uint64_t boost_age_start_;
+ uint64_t boost_step_;
+};
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/sst_partitioner.cc b/src/rocksdb/db/compaction/sst_partitioner.cc
new file mode 100644
index 000000000..9e7f9fa89
--- /dev/null
+++ b/src/rocksdb/db/compaction/sst_partitioner.cc
@@ -0,0 +1,90 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#include "rocksdb/sst_partitioner.h"
+
+#include <algorithm>
+
+#include "rocksdb/utilities/customizable_util.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "rocksdb/utilities/options_type.h"
+
+namespace ROCKSDB_NAMESPACE {
+static std::unordered_map<std::string, OptionTypeInfo>
+ sst_fixed_prefix_type_info = {
+#ifndef ROCKSDB_LITE
+ {"length",
+ {0, OptionType::kSizeT, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+#endif // ROCKSDB_LITE
+};
+
+SstPartitionerFixedPrefixFactory::SstPartitionerFixedPrefixFactory(size_t len)
+ : len_(len) {
+ RegisterOptions("Length", &len_, &sst_fixed_prefix_type_info);
+}
+
+PartitionerResult SstPartitionerFixedPrefix::ShouldPartition(
+ const PartitionerRequest& request) {
+ Slice last_key_fixed(*request.prev_user_key);
+ if (last_key_fixed.size() > len_) {
+ last_key_fixed.size_ = len_;
+ }
+ Slice current_key_fixed(*request.current_user_key);
+ if (current_key_fixed.size() > len_) {
+ current_key_fixed.size_ = len_;
+ }
+ return last_key_fixed.compare(current_key_fixed) != 0 ? kRequired
+ : kNotRequired;
+}
+
+bool SstPartitionerFixedPrefix::CanDoTrivialMove(
+ const Slice& smallest_user_key, const Slice& largest_user_key) {
+ return ShouldPartition(PartitionerRequest(smallest_user_key, largest_user_key,
+ 0)) == kNotRequired;
+}
+
+std::unique_ptr<SstPartitioner>
+SstPartitionerFixedPrefixFactory::CreatePartitioner(
+ const SstPartitioner::Context& /* context */) const {
+ return std::unique_ptr<SstPartitioner>(new SstPartitionerFixedPrefix(len_));
+}
+
+std::shared_ptr<SstPartitionerFactory> NewSstPartitionerFixedPrefixFactory(
+ size_t prefix_len) {
+ return std::make_shared<SstPartitionerFixedPrefixFactory>(prefix_len);
+}
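+
+// Illustrative usage (exposition only, not part of the original source): the
+// factory is typically installed via the column family options so that
+// compaction splits output files whenever the first `prefix_len` bytes of the
+// user key change, e.g.
+//   options.sst_partitioner_factory = NewSstPartitionerFixedPrefixFactory(4);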
+
+#ifndef ROCKSDB_LITE
+namespace {
+static int RegisterSstPartitionerFactories(ObjectLibrary& library,
+ const std::string& /*arg*/) {
+ library.AddFactory<SstPartitionerFactory>(
+ SstPartitionerFixedPrefixFactory::kClassName(),
+ [](const std::string& /*uri*/,
+ std::unique_ptr<SstPartitionerFactory>* guard,
+ std::string* /* errmsg */) {
+ guard->reset(new SstPartitionerFixedPrefixFactory(0));
+ return guard->get();
+ });
+ return 1;
+}
+} // namespace
+#endif // ROCKSDB_LITE
+
+Status SstPartitionerFactory::CreateFromString(
+ const ConfigOptions& options, const std::string& value,
+ std::shared_ptr<SstPartitionerFactory>* result) {
+#ifndef ROCKSDB_LITE
+ static std::once_flag once;
+ std::call_once(once, [&]() {
+ RegisterSstPartitionerFactories(*(ObjectLibrary::Default().get()), "");
+ });
+#endif // ROCKSDB_LITE
+ return LoadSharedObject<SstPartitionerFactory>(options, value, nullptr,
+ result);
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/subcompaction_state.cc b/src/rocksdb/db/compaction/subcompaction_state.cc
new file mode 100644
index 000000000..0c56471e9
--- /dev/null
+++ b/src/rocksdb/db/compaction/subcompaction_state.cc
@@ -0,0 +1,106 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/subcompaction_state.h"
+
+#include "rocksdb/sst_partitioner.h"
+
+namespace ROCKSDB_NAMESPACE {
+void SubcompactionState::AggregateCompactionStats(
+ InternalStats::CompactionStatsFull& compaction_stats) const {
+ compaction_stats.stats.Add(compaction_outputs_.stats_);
+ if (HasPenultimateLevelOutputs()) {
+ compaction_stats.has_penultimate_level_output = true;
+ compaction_stats.penultimate_level_stats.Add(
+ penultimate_level_outputs_.stats_);
+ }
+}
+
+OutputIterator SubcompactionState::GetOutputs() const {
+ return OutputIterator(penultimate_level_outputs_.outputs_,
+ compaction_outputs_.outputs_);
+}
+
+void SubcompactionState::Cleanup(Cache* cache) {
+ penultimate_level_outputs_.Cleanup();
+ compaction_outputs_.Cleanup();
+
+ if (!status.ok()) {
+ for (const auto& out : GetOutputs()) {
+      // If this file was inserted into the table cache then remove
+      // it here because this compaction was not committed.
+ TableCache::Evict(cache, out.meta.fd.GetNumber());
+ }
+ }
+  // TODO: sub_compact.io_status is not checked like status. Not sure if that's
+  // intentional, so ignoring the io_status for now.
+ io_status.PermitUncheckedError();
+}
+
+Slice SubcompactionState::SmallestUserKey() const {
+ if (has_penultimate_level_outputs_) {
+ Slice a = compaction_outputs_.SmallestUserKey();
+ Slice b = penultimate_level_outputs_.SmallestUserKey();
+ if (a.empty()) {
+ return b;
+ }
+ if (b.empty()) {
+ return a;
+ }
+ const Comparator* user_cmp =
+ compaction->column_family_data()->user_comparator();
+ if (user_cmp->Compare(a, b) > 0) {
+ return b;
+ } else {
+ return a;
+ }
+ } else {
+ return compaction_outputs_.SmallestUserKey();
+ }
+}
+
+Slice SubcompactionState::LargestUserKey() const {
+ if (has_penultimate_level_outputs_) {
+ Slice a = compaction_outputs_.LargestUserKey();
+ Slice b = penultimate_level_outputs_.LargestUserKey();
+ if (a.empty()) {
+ return b;
+ }
+ if (b.empty()) {
+ return a;
+ }
+ const Comparator* user_cmp =
+ compaction->column_family_data()->user_comparator();
+ if (user_cmp->Compare(a, b) < 0) {
+ return b;
+ } else {
+ return a;
+ }
+ } else {
+ return compaction_outputs_.LargestUserKey();
+ }
+}
+
+Status SubcompactionState::AddToOutput(
+ const CompactionIterator& iter,
+ const CompactionFileOpenFunc& open_file_func,
+ const CompactionFileCloseFunc& close_file_func) {
+ // update target output first
+ is_current_penultimate_level_ = iter.output_to_penultimate_level();
+ current_outputs_ = is_current_penultimate_level_ ? &penultimate_level_outputs_
+ : &compaction_outputs_;
+ if (is_current_penultimate_level_) {
+ has_penultimate_level_outputs_ = true;
+ }
+
+ return Current().AddToOutput(iter, open_file_func, close_file_func);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/subcompaction_state.h b/src/rocksdb/db/compaction/subcompaction_state.h
new file mode 100644
index 000000000..13e63120f
--- /dev/null
+++ b/src/rocksdb/db/compaction/subcompaction_state.h
@@ -0,0 +1,214 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <optional>
+
+#include "db/blob/blob_file_addition.h"
+#include "db/blob/blob_garbage_meter.h"
+#include "db/compaction/compaction.h"
+#include "db/compaction/compaction_iterator.h"
+#include "db/compaction/compaction_outputs.h"
+#include "db/internal_stats.h"
+#include "db/output_validator.h"
+#include "db/range_del_aggregator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Maintains state and outputs for each sub-compaction.
+// It contains 2 `CompactionOutputs`:
+// 1. one for the normal output files
+// 2. another for the penultimate level outputs
+// A `current` pointer tracks the active output group: when `AddToOutput()` is
+// called, it checks the output destination of the current compaction_iterator
+// key and points `current` at the target output group. By default it points to
+// the normal compaction_outputs; if the compaction_iterator key should be
+// placed on the penultimate level, `current` is changed to point to
+// `penultimate_level_outputs`.
+// Later operations use `Current()` to get the target group.
+//
+// +----------+ +-----------------------------+ +---------+
+// | *current |--------> | compaction_outputs |----->| output |
+// +----------+ +-----------------------------+ +---------+
+// | | output |
+// | +---------+
+// | | ... |
+// |
+// | +-----------------------------+ +---------+
+// +-------------> | penultimate_level_outputs |----->| output |
+// +-----------------------------+ +---------+
+// | ... |
+
+class SubcompactionState {
+ public:
+ const Compaction* compaction;
+
+ // The boundaries of the key-range this compaction is interested in. No two
+ // sub-compactions may have overlapping key-ranges.
+  // 'start' is inclusive, 'end' is exclusive, and std::nullopt means unbounded
+ const std::optional<Slice> start, end;
+
+ // The return status of this sub-compaction
+ Status status;
+
+ // The return IO Status of this sub-compaction
+ IOStatus io_status;
+
+ // Notify on sub-compaction completion only if listener was notified on
+ // sub-compaction begin.
+ bool notify_on_subcompaction_completion = false;
+
+ // compaction job stats for this sub-compaction
+ CompactionJobStats compaction_job_stats;
+
+  // Sub-compaction job id, used to distinguish different sub-compactions
+  // within the same compaction job.
+ const uint32_t sub_job_id;
+
+ Slice SmallestUserKey() const;
+
+ Slice LargestUserKey() const;
+
+ // Get all outputs from the subcompaction. For per_key_placement compaction,
+ // it returns both the last level outputs and penultimate level outputs.
+ OutputIterator GetOutputs() const;
+
+  // Assign the range-del aggregator. Each range del can only be assigned to
+  // one output level; for per_key_placement, that level is the penultimate
+  // level.
+ void AssignRangeDelAggregator(
+ std::unique_ptr<CompactionRangeDelAggregator>&& range_del_agg) {
+ if (compaction->SupportsPerKeyPlacement()) {
+ penultimate_level_outputs_.AssignRangeDelAggregator(
+ std::move(range_del_agg));
+ } else {
+ compaction_outputs_.AssignRangeDelAggregator(std::move(range_del_agg));
+ }
+ }
+
+ void RemoveLastEmptyOutput() {
+ compaction_outputs_.RemoveLastEmptyOutput();
+ penultimate_level_outputs_.RemoveLastEmptyOutput();
+ }
+
+#ifndef ROCKSDB_LITE
+ void BuildSubcompactionJobInfo(
+ SubcompactionJobInfo& subcompaction_job_info) const {
+ const Compaction* c = compaction;
+ const ColumnFamilyData* cfd = c->column_family_data();
+
+ subcompaction_job_info.cf_id = cfd->GetID();
+ subcompaction_job_info.cf_name = cfd->GetName();
+ subcompaction_job_info.status = status;
+ subcompaction_job_info.subcompaction_job_id = static_cast<int>(sub_job_id);
+ subcompaction_job_info.base_input_level = c->start_level();
+ subcompaction_job_info.output_level = c->output_level();
+ subcompaction_job_info.stats = compaction_job_stats;
+ }
+#endif // !ROCKSDB_LITE
+
+ SubcompactionState() = delete;
+ SubcompactionState(const SubcompactionState&) = delete;
+ SubcompactionState& operator=(const SubcompactionState&) = delete;
+
+ SubcompactionState(Compaction* c, const std::optional<Slice> _start,
+ const std::optional<Slice> _end, uint32_t _sub_job_id)
+ : compaction(c),
+ start(_start),
+ end(_end),
+ sub_job_id(_sub_job_id),
+ compaction_outputs_(c, /*is_penultimate_level=*/false),
+ penultimate_level_outputs_(c, /*is_penultimate_level=*/true) {
+ assert(compaction != nullptr);
+    // Set the output split key (used for the RoundRobin feature) only for the
+    // normal compaction_outputs; the output_to_penultimate_level feature
+    // doesn't support RoundRobin (and may never support it, because with
+    // RoundRobin the data is mostly naturally sorted by time, so there is no
+    // need for per-key placement with output_to_penultimate_level).
+ compaction_outputs_.SetOutputSlitKey(start, end);
+ }
+
+ SubcompactionState(SubcompactionState&& state) noexcept
+ : compaction(state.compaction),
+ start(state.start),
+ end(state.end),
+ status(std::move(state.status)),
+ io_status(std::move(state.io_status)),
+ notify_on_subcompaction_completion(
+ state.notify_on_subcompaction_completion),
+ compaction_job_stats(std::move(state.compaction_job_stats)),
+ sub_job_id(state.sub_job_id),
+ compaction_outputs_(std::move(state.compaction_outputs_)),
+ penultimate_level_outputs_(std::move(state.penultimate_level_outputs_)),
+ is_current_penultimate_level_(state.is_current_penultimate_level_),
+ has_penultimate_level_outputs_(state.has_penultimate_level_outputs_) {
+ current_outputs_ = is_current_penultimate_level_
+ ? &penultimate_level_outputs_
+ : &compaction_outputs_;
+ }
+
+ bool HasPenultimateLevelOutputs() const {
+ return has_penultimate_level_outputs_ ||
+ penultimate_level_outputs_.HasRangeDel();
+ }
+
+ bool IsCurrentPenultimateLevel() const {
+ return is_current_penultimate_level_;
+ }
+
+ // Add all the new files from this compaction to version_edit
+ void AddOutputsEdit(VersionEdit* out_edit) const {
+ for (const auto& file : penultimate_level_outputs_.outputs_) {
+ out_edit->AddFile(compaction->GetPenultimateLevel(), file.meta);
+ }
+ for (const auto& file : compaction_outputs_.outputs_) {
+ out_edit->AddFile(compaction->output_level(), file.meta);
+ }
+ }
+
+ void Cleanup(Cache* cache);
+
+ void AggregateCompactionStats(
+ InternalStats::CompactionStatsFull& compaction_stats) const;
+
+ CompactionOutputs& Current() const {
+ assert(current_outputs_);
+ return *current_outputs_;
+ }
+
+ // Add compaction_iterator key/value to the `Current` output group.
+ Status AddToOutput(const CompactionIterator& iter,
+ const CompactionFileOpenFunc& open_file_func,
+ const CompactionFileCloseFunc& close_file_func);
+
+ // Close all compaction output files, both output_to_penultimate_level outputs
+ // and normal outputs.
+ Status CloseCompactionFiles(const Status& curr_status,
+ const CompactionFileOpenFunc& open_file_func,
+ const CompactionFileCloseFunc& close_file_func) {
+ // Call FinishCompactionOutputFile() even if status is not ok: it needs to
+ // close the output file.
+ Status s = penultimate_level_outputs_.CloseOutput(
+ curr_status, open_file_func, close_file_func);
+ s = compaction_outputs_.CloseOutput(s, open_file_func, close_file_func);
+ return s;
+ }
+
+ private:
+ // State kept for output being generated
+ CompactionOutputs compaction_outputs_;
+ CompactionOutputs penultimate_level_outputs_;
+ CompactionOutputs* current_outputs_ = &compaction_outputs_;
+ bool is_current_penultimate_level_ = false;
+ bool has_penultimate_level_outputs_ = false;
+};
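+
+// A minimal usage sketch (illustrative only, not part of this change),
+// assuming `open_file` / `close_file` are the compaction job's file callbacks
+// and `c_iter` is a positioned CompactionIterator:
+//
+//   SubcompactionState sub(compaction, start, end, /*sub_job_id=*/0);
+//   Status s;
+//   for (; s.ok() && c_iter.Valid(); c_iter.Next()) {
+//     // AddToOutput() redirects the key to penultimate_level_outputs_ when
+//     // the iterator marks it with output_to_penultimate_level().
+//     s = sub.AddToOutput(c_iter, open_file, close_file);
+//   }
+//   // Close both output groups even on error, then drop empty trailing files.
+//   s = sub.CloseCompactionFiles(s, open_file, close_file);
+//   sub.RemoveLastEmptyOutput();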
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/tiered_compaction_test.cc b/src/rocksdb/db/compaction/tiered_compaction_test.cc
new file mode 100644
index 000000000..aaebcfd94
--- /dev/null
+++ b/src/rocksdb/db/compaction/tiered_compaction_test.cc
@@ -0,0 +1,2028 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/iostats_context.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/utilities/debug.h"
+#include "test_util/mock_time_env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#if !defined(ROCKSDB_LITE)
+
+class TieredCompactionTest : public DBTestBase,
+ public testing::WithParamInterface<bool> {
+ public:
+ TieredCompactionTest()
+ : DBTestBase("tiered_compaction_test", /*env_do_fsync=*/true),
+ kBasicCompStats(CompactionReason::kUniversalSizeAmplification, 1),
+ kBasicPerKeyPlacementCompStats(
+ CompactionReason::kUniversalSizeAmplification, 1),
+ kBasicFlushStats(CompactionReason::kFlush, 1) {
+ kBasicCompStats.micros = kHasValue;
+ kBasicCompStats.cpu_micros = kHasValue;
+ kBasicCompStats.bytes_read_non_output_levels = kHasValue;
+ kBasicCompStats.num_input_files_in_non_output_levels = kHasValue;
+ kBasicCompStats.num_input_records = kHasValue;
+ kBasicCompStats.num_dropped_records = kHasValue;
+
+ kBasicPerLevelStats.num_output_records = kHasValue;
+ kBasicPerLevelStats.bytes_written = kHasValue;
+ kBasicPerLevelStats.num_output_files = kHasValue;
+
+ kBasicPerKeyPlacementCompStats.micros = kHasValue;
+ kBasicPerKeyPlacementCompStats.cpu_micros = kHasValue;
+ kBasicPerKeyPlacementCompStats.Add(kBasicPerLevelStats);
+
+ kBasicFlushStats.micros = kHasValue;
+ kBasicFlushStats.cpu_micros = kHasValue;
+ kBasicFlushStats.bytes_written = kHasValue;
+ kBasicFlushStats.num_output_files = kHasValue;
+ }
+
+ protected:
+ static constexpr uint8_t kHasValue = 1;
+
+ InternalStats::CompactionStats kBasicCompStats;
+ InternalStats::CompactionStats kBasicPerKeyPlacementCompStats;
+ InternalStats::CompactionOutputsStats kBasicPerLevelStats;
+ InternalStats::CompactionStats kBasicFlushStats;
+
+ std::atomic_bool enable_per_key_placement = true;
+
+ void SetUp() override {
+ SyncPoint::GetInstance()->SetCallBack(
+ "Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) {
+ auto supports_per_key_placement = static_cast<bool*>(arg);
+ *supports_per_key_placement = enable_per_key_placement;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ }
+
+ const std::vector<InternalStats::CompactionStats>& GetCompactionStats() {
+ VersionSet* const versions = dbfull()->GetVersionSet();
+ assert(versions);
+ assert(versions->GetColumnFamilySet());
+
+ ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+ assert(cfd);
+
+ const InternalStats* const internal_stats = cfd->internal_stats();
+ assert(internal_stats);
+
+ return internal_stats->TEST_GetCompactionStats();
+ }
+
+ const InternalStats::CompactionStats& GetPerKeyPlacementCompactionStats() {
+ VersionSet* const versions = dbfull()->GetVersionSet();
+ assert(versions);
+ assert(versions->GetColumnFamilySet());
+
+ ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+ assert(cfd);
+
+ const InternalStats* const internal_stats = cfd->internal_stats();
+ assert(internal_stats);
+
+ return internal_stats->TEST_GetPerKeyPlacementCompactionStats();
+ }
+
+  // Verify the compaction stats. The stats are only roughly compared: fields
+  // expected to be non-zero (kHasValue) just need to be non-zero.
+ void VerifyCompactionStats(
+ const std::vector<InternalStats::CompactionStats>& expect_stats,
+ const InternalStats::CompactionStats& expect_pl_stats) {
+ const std::vector<InternalStats::CompactionStats>& stats =
+ GetCompactionStats();
+ const size_t kLevels = expect_stats.size();
+ ASSERT_EQ(kLevels, stats.size());
+
+ for (auto it = stats.begin(), expect = expect_stats.begin();
+ it != stats.end(); it++, expect++) {
+ VerifyCompactionStats(*it, *expect);
+ }
+
+ const InternalStats::CompactionStats& pl_stats =
+ GetPerKeyPlacementCompactionStats();
+ VerifyCompactionStats(pl_stats, expect_pl_stats);
+ }
+
+ void ResetAllStats(std::vector<InternalStats::CompactionStats>& stats,
+ InternalStats::CompactionStats& pl_stats) {
+ ASSERT_OK(dbfull()->ResetStats());
+ for (auto& level_stats : stats) {
+ level_stats.Clear();
+ }
+ pl_stats.Clear();
+ }
+
+  // bottommost_temperature is being renamed to last_level_temperature;
+  // setting either of them should have the same effect.
+ void SetColdTemperature(Options& options) {
+ if (GetParam()) {
+ options.bottommost_temperature = Temperature::kCold;
+ } else {
+ options.last_level_temperature = Temperature::kCold;
+ }
+ }
+
+ private:
+ void CompareStats(uint64_t val, uint64_t expect) {
+ if (expect > 0) {
+ ASSERT_TRUE(val > 0);
+ } else {
+ ASSERT_EQ(val, 0);
+ }
+ }
+
+ void VerifyCompactionStats(
+ const InternalStats::CompactionStats& stats,
+ const InternalStats::CompactionStats& expect_stats) {
+ CompareStats(stats.micros, expect_stats.micros);
+ CompareStats(stats.cpu_micros, expect_stats.cpu_micros);
+ CompareStats(stats.bytes_read_non_output_levels,
+ expect_stats.bytes_read_non_output_levels);
+ CompareStats(stats.bytes_read_output_level,
+ expect_stats.bytes_read_output_level);
+ CompareStats(stats.bytes_read_blob, expect_stats.bytes_read_blob);
+ CompareStats(stats.bytes_written, expect_stats.bytes_written);
+ CompareStats(stats.bytes_moved, expect_stats.bytes_moved);
+ CompareStats(stats.num_input_files_in_non_output_levels,
+ expect_stats.num_input_files_in_non_output_levels);
+ CompareStats(stats.num_input_files_in_output_level,
+ expect_stats.num_input_files_in_output_level);
+ CompareStats(stats.num_output_files, expect_stats.num_output_files);
+ CompareStats(stats.num_output_files_blob,
+ expect_stats.num_output_files_blob);
+ CompareStats(stats.num_input_records, expect_stats.num_input_records);
+ CompareStats(stats.num_dropped_records, expect_stats.num_dropped_records);
+ CompareStats(stats.num_output_records, expect_stats.num_output_records);
+ ASSERT_EQ(stats.count, expect_stats.count);
+ for (int i = 0; i < static_cast<int>(CompactionReason::kNumOfReasons);
+ i++) {
+ ASSERT_EQ(stats.counts[i], expect_stats.counts[i]);
+ }
+ }
+};
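+
+// The tests below drive per-key hot/cold placement through a SyncPoint hook on
+// the compaction iterator. A minimal sketch of the pattern (illustrative only;
+// each test installs its own predicate):
+//
+//   SyncPoint::GetInstance()->SetCallBack(
+//       "CompactionIterator::PrepareOutput.context", [&](void* arg) {
+//         auto context = static_cast<PerKeyPlacementContext*>(arg);
+//         // true routes the key to the hot (penultimate) tier
+//         context->output_to_penultimate_level = /* is the key hot? */ true;
+//       });
+//
+// Keys kept on the penultimate level stay at Temperature::kUnknown, while keys
+// written to the last level get the configured cold temperature.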
+
+TEST_P(TieredCompactionTest, SequenceBasedTieredStorageUniversal) {
+ const int kNumTrigger = 4;
+ const int kNumLevels = 7;
+ const int kNumKeys = 100;
+ const int kLastLevel = kNumLevels - 1;
+
+ auto options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ SetColdTemperature(options);
+ options.level0_file_num_compaction_trigger = kNumTrigger;
+ options.statistics = CreateDBStatistics();
+ options.max_subcompactions = 10;
+ DestroyAndReopen(options);
+
+ std::atomic_uint64_t latest_cold_seq = 0;
+ std::vector<SequenceNumber> seq_history;
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionIterator::PrepareOutput.context", [&](void* arg) {
+ auto context = static_cast<PerKeyPlacementContext*>(arg);
+ context->output_to_penultimate_level =
+ context->seq_num > latest_cold_seq;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ std::vector<InternalStats::CompactionStats> expect_stats(kNumLevels);
+ InternalStats::CompactionStats& last_stats = expect_stats[kLastLevel];
+ InternalStats::CompactionStats expect_pl_stats;
+
+ for (int i = 0; i < kNumTrigger; i++) {
+ for (int j = 0; j < kNumKeys; j++) {
+ ASSERT_OK(Put(Key(i * 10 + j), "value" + std::to_string(i)));
+ }
+ ASSERT_OK(Flush());
+ seq_history.emplace_back(dbfull()->GetLatestSequenceNumber());
+ expect_stats[0].Add(kBasicFlushStats);
+ }
+ ASSERT_OK(dbfull()->WaitForCompact(true));
+
+  // The penultimate level file temperature is not cold; all data is output to
+  // the penultimate level.
+ ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+ // basic compaction stats are still counted to the last level
+ expect_stats[kLastLevel].Add(kBasicCompStats);
+ expect_pl_stats.Add(kBasicPerKeyPlacementCompStats);
+
+ VerifyCompactionStats(expect_stats, expect_pl_stats);
+
+ ResetAllStats(expect_stats, expect_pl_stats);
+
+  // Move the cold_seq forward to split the data into 2 levels, so there
+  // should be both last level stats and output_to_penultimate_level stats.
+ latest_cold_seq = seq_history[0];
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ last_stats.Add(kBasicCompStats);
+ last_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
+ last_stats.Add(kBasicPerLevelStats);
+ last_stats.num_dropped_records = 0;
+ expect_pl_stats.Add(kBasicPerKeyPlacementCompStats);
+ expect_pl_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
+ VerifyCompactionStats(expect_stats, expect_pl_stats);
+
+ // delete all cold data, so all data will be on penultimate level
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Delete(Key(i)));
+ }
+ ASSERT_OK(Flush());
+
+ ResetAllStats(expect_stats, expect_pl_stats);
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+ last_stats.Add(kBasicCompStats);
+ last_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
+ last_stats.bytes_read_output_level = kHasValue;
+ last_stats.num_input_files_in_output_level = kHasValue;
+ expect_pl_stats.Add(kBasicPerKeyPlacementCompStats);
+ expect_pl_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
+ VerifyCompactionStats(expect_stats, expect_pl_stats);
+
+ // move forward the cold_seq again with range delete, take a snapshot to keep
+ // the range dels in both cold and hot SSTs
+ auto snap = db_->GetSnapshot();
+ latest_cold_seq = seq_history[2];
+ std::string start = Key(25), end = Key(35);
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
+ ASSERT_OK(Flush());
+
+ ResetAllStats(expect_stats, expect_pl_stats);
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ last_stats.Add(kBasicCompStats);
+ last_stats.Add(kBasicPerLevelStats);
+ last_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
+ expect_pl_stats.Add(kBasicPerKeyPlacementCompStats);
+ expect_pl_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
+ VerifyCompactionStats(expect_stats, expect_pl_stats);
+
+ // verify data
+ std::string value;
+ for (int i = 0; i < kNumKeys; i++) {
+ if (i < 10 || (i >= 25 && i < 35)) {
+ ASSERT_TRUE(db_->Get(ReadOptions(), Key(i), &value).IsNotFound());
+ } else {
+ ASSERT_OK(db_->Get(ReadOptions(), Key(i), &value));
+ }
+ }
+
+ // range delete all hot data
+ start = Key(30);
+ end = Key(130);
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ // no range del is dropped because of snapshot
+ ASSERT_EQ(
+ options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE),
+ 0);
+
+  // Releasing the snapshot and compacting again should remove all hot data.
+ db_->ReleaseSnapshot(snap);
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // both range dels are dropped (the ticker reads 3 because one range del was
+  // split across 2 SST files and is counted twice)
+ ASSERT_EQ(
+ options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE),
+ 3);
+
+  // Move the cold_seq backward, e.g. the user may change the hot/cold
+  // setting; it won't impact the existing cold data, as the sequence numbers
+  // have already been zeroed out.
+ latest_cold_seq = seq_history[1];
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+}
+
+TEST_P(TieredCompactionTest, RangeBasedTieredStorageUniversal) {
+ const int kNumTrigger = 4;
+ const int kNumLevels = 7;
+ const int kNumKeys = 100;
+ const int kLastLevel = kNumLevels - 1;
+
+ auto options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ SetColdTemperature(options);
+ options.level0_file_num_compaction_trigger = kNumTrigger;
+ options.statistics = CreateDBStatistics();
+ options.max_subcompactions = 10;
+ DestroyAndReopen(options);
+ auto cmp = options.comparator;
+
+ port::Mutex mutex;
+ std::string hot_start = Key(10);
+ std::string hot_end = Key(50);
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionIterator::PrepareOutput.context", [&](void* arg) {
+ auto context = static_cast<PerKeyPlacementContext*>(arg);
+ MutexLock l(&mutex);
+ context->output_to_penultimate_level =
+ cmp->Compare(context->key, hot_start) >= 0 &&
+ cmp->Compare(context->key, hot_end) < 0;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ std::vector<InternalStats::CompactionStats> expect_stats(kNumLevels);
+ InternalStats::CompactionStats& last_stats = expect_stats[kLastLevel];
+ InternalStats::CompactionStats expect_pl_stats;
+
+ for (int i = 0; i < kNumTrigger; i++) {
+ for (int j = 0; j < kNumKeys; j++) {
+ ASSERT_OK(Put(Key(j), "value" + std::to_string(j)));
+ }
+ ASSERT_OK(Flush());
+ expect_stats[0].Add(kBasicFlushStats);
+ }
+ ASSERT_OK(dbfull()->WaitForCompact(true));
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ last_stats.Add(kBasicCompStats);
+ last_stats.Add(kBasicPerLevelStats);
+ expect_pl_stats.Add(kBasicPerKeyPlacementCompStats);
+ VerifyCompactionStats(expect_stats, expect_pl_stats);
+
+ ResetAllStats(expect_stats, expect_pl_stats);
+
+ // change to all cold, no output_to_penultimate_level output
+ {
+ MutexLock l(&mutex);
+ hot_start = Key(100);
+ hot_end = Key(200);
+ }
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ last_stats.Add(kBasicCompStats);
+ last_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
+ last_stats.Add(kBasicPerLevelStats);
+ last_stats.num_dropped_records = 0;
+ last_stats.bytes_read_output_level = kHasValue;
+ last_stats.num_input_files_in_output_level = kHasValue;
+ VerifyCompactionStats(expect_stats, expect_pl_stats);
+
+  // Change to all hot. Universal compaction supports moving data up a level
+  // if it's within the compaction's level range.
+ {
+ MutexLock l(&mutex);
+ hot_start = Key(0);
+ hot_end = Key(100);
+ }
+
+  // No data is moved from the cold tier to the hot tier: with no input files
+  // from L5 or higher, it's not safe to move data to the penultimate level.
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+
+  // Add 2 keys in a higher level, but in separate files; all keys can be
+  // moved up if they're hot.
+ ASSERT_OK(Put(Key(0), "value" + std::to_string(0)));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put(Key(50), "value" + std::to_string(0)));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // Change to only 1 cold key, to test that compaction can stop even when it
+  // matches the size-amp compaction threshold.
+ {
+ MutexLock l(&mutex);
+ hot_start = Key(1);
+ hot_end = Key(1000);
+ }
+
+  // generate just enough files to trigger compaction
+ for (int i = 0; i < kNumTrigger - 1; i++) {
+ for (int j = 0; j < 1000; j++) {
+ ASSERT_OK(Put(Key(j), "value" + std::to_string(j)));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->WaitForCompact(
+ true)); // make sure the compaction is able to finish
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+ auto opts = db_->GetOptions();
+ auto max_size_amp =
+ opts.compaction_options_universal.max_size_amplification_percent / 100;
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown),
+ GetSstSizeHelper(Temperature::kCold) * max_size_amp);
+
+ // delete all cold data
+ ASSERT_OK(Delete(Key(0)));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // Range delete overlapping both hot and cold data, with a snapshot to make
+  // sure the range del is preserved.
+ auto snap = db_->GetSnapshot();
+ {
+ MutexLock l(&mutex);
+ hot_start = Key(50);
+ hot_end = Key(100);
+ }
+ std::string start = Key(1), end = Key(70);
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ // no range del is dropped until snapshot is released
+ ASSERT_EQ(
+ options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE),
+ 0);
+
+ // verify data
+ std::string value;
+ for (int i = 0; i < kNumKeys; i++) {
+ if (i < 70) {
+ ASSERT_TRUE(db_->Get(ReadOptions(), Key(i), &value).IsNotFound());
+ } else {
+ ASSERT_OK(db_->Get(ReadOptions(), Key(i), &value));
+ }
+ }
+
+ db_->ReleaseSnapshot(snap);
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ // range del is dropped
+ ASSERT_EQ(
+ options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE),
+ 1);
+}
+
+TEST_P(TieredCompactionTest, LevelColdRangeDelete) {
+ const int kNumTrigger = 4;
+ const int kNumLevels = 7;
+ const int kNumKeys = 100;
+ const int kLastLevel = kNumLevels - 1;
+
+ auto options = CurrentOptions();
+ SetColdTemperature(options);
+ options.level0_file_num_compaction_trigger = kNumTrigger;
+ options.num_levels = kNumLevels;
+ options.statistics = CreateDBStatistics();
+ options.max_subcompactions = 10;
+ DestroyAndReopen(options);
+
+ std::atomic_uint64_t latest_cold_seq = 0;
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionIterator::PrepareOutput.context", [&](void* arg) {
+ auto context = static_cast<PerKeyPlacementContext*>(arg);
+ context->output_to_penultimate_level =
+ context->seq_num > latest_cold_seq;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(Put(Key(i), "value" + std::to_string(i)));
+ }
+ ASSERT_OK(Flush());
+
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,1",
+ FilesPerLevel()); // bottommost but not last level file is hot
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+ // explicitly move the data to the last level
+ MoveFilesToLevel(kLastLevel);
+
+ ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+
+ auto snap = db_->GetSnapshot();
+
+ std::string start = Key(10);
+ std::string end = Key(50);
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
+
+  // Keys 20->30 will be marked as cold data, but they cannot be placed in the
+  // cold tier (bottommost); otherwise they would be "deleted" by the range
+  // del in the output_to_penultimate_level level. Verify that this data
+  // remains queryable.
+ for (int i = 20; i < 30; i++) {
+ ASSERT_OK(Put(Key(i), "value" + std::to_string(i)));
+ }
+ // make the range tombstone and data after that cold
+ latest_cold_seq = dbfull()->GetLatestSequenceNumber();
+
+  // add some hot data, just for the test
+ for (int i = 30; i < 40; i++) {
+ ASSERT_OK(Put(Key(i), "value" + std::to_string(i)));
+ }
+
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ std::string value;
+ for (int i = 0; i < kNumKeys; i++) {
+ auto s = db_->Get(ReadOptions(), Key(i), &value);
+ if ((i >= 10 && i < 20) || (i >= 40 && i < 50)) {
+ ASSERT_TRUE(s.IsNotFound());
+ } else {
+ ASSERT_OK(s);
+ }
+ }
+
+ db_->ReleaseSnapshot(snap);
+}
+
+// Test SST partitioner that cuts after every single key
+class SingleKeySstPartitioner : public SstPartitioner {
+ public:
+ const char* Name() const override { return "SingleKeySstPartitioner"; }
+
+ PartitionerResult ShouldPartition(
+ const PartitionerRequest& /*request*/) override {
+ return kRequired;
+ }
+
+ bool CanDoTrivialMove(const Slice& /*smallest_user_key*/,
+ const Slice& /*largest_user_key*/) override {
+ return false;
+ }
+};
+
+class SingleKeySstPartitionerFactory : public SstPartitionerFactory {
+ public:
+ static const char* kClassName() { return "SingleKeySstPartitionerFactory"; }
+ const char* Name() const override { return kClassName(); }
+
+ std::unique_ptr<SstPartitioner> CreatePartitioner(
+ const SstPartitioner::Context& /* context */) const override {
+ return std::unique_ptr<SstPartitioner>(new SingleKeySstPartitioner());
+ }
+};
+
+TEST_P(TieredCompactionTest, LevelOutofBoundaryRangeDelete) {
+ const int kNumTrigger = 4;
+ const int kNumLevels = 3;
+ const int kNumKeys = 10;
+
+ auto factory = std::make_shared<SingleKeySstPartitionerFactory>();
+ auto options = CurrentOptions();
+ SetColdTemperature(options);
+ options.level0_file_num_compaction_trigger = kNumTrigger;
+ options.num_levels = kNumLevels;
+ options.statistics = CreateDBStatistics();
+ options.sst_partitioner_factory = factory;
+ options.max_subcompactions = 10;
+ DestroyAndReopen(options);
+
+ std::atomic_uint64_t latest_cold_seq = 0;
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionIterator::PrepareOutput.context", [&](void* arg) {
+ auto context = static_cast<PerKeyPlacementContext*>(arg);
+ context->output_to_penultimate_level =
+ context->seq_num > latest_cold_seq;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(Put(Key(i), "value" + std::to_string(i)));
+ }
+ ASSERT_OK(Flush());
+
+ MoveFilesToLevel(kNumLevels - 1);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+ ASSERT_EQ("0,0,10", FilesPerLevel());
+
+ auto snap = db_->GetSnapshot();
+
+ // only range delete
+ std::string start = Key(3);
+ std::string end = Key(5);
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
+ ASSERT_OK(Flush());
+
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown),
+            0);  // the tombstone has no size, even though it's in the hot tier
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+ ASSERT_EQ("0,1,10",
+ FilesPerLevel()); // one file is at the penultimate level which
+ // only contains a range delete
+
+  // Add 2 hot keys, each in a new SST; they will be placed in the same level
+  // as the range del, but they don't overlap with it. Make sure the range del
+  // still stays at that level.
+ latest_cold_seq = dbfull()->GetLatestSequenceNumber();
+ ASSERT_OK(Put(Key(0), "new value" + std::to_string(0)));
+ auto snap2 = db_->GetSnapshot();
+ ASSERT_OK(Put(Key(6), "new value" + std::to_string(6)));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,2,10",
+ FilesPerLevel()); // one file is at the penultimate level
+ // which only contains a range delete
+ std::vector<LiveFileMetaData> live_file_meta;
+ db_->GetLiveFilesMetaData(&live_file_meta);
+ bool found_sst_with_del = false;
+ uint64_t sst_with_del_num = 0;
+ for (const auto& meta : live_file_meta) {
+ if (meta.num_deletions > 0) {
+ // found SST with del, which has 2 entries, one for data one for range del
+ ASSERT_EQ(meta.level,
+ kNumLevels - 2); // output to penultimate level
+ ASSERT_EQ(meta.num_entries, 2);
+ ASSERT_EQ(meta.num_deletions, 1);
+ found_sst_with_del = true;
+ sst_with_del_num = meta.file_number;
+ }
+ }
+ ASSERT_TRUE(found_sst_with_del);
+
+  // Release the first snapshot and compact, which should drop the range del,
+  // but the newly inserted keys `0` and `6` are still hot data and will stay
+  // on the penultimate level.
+ db_->ReleaseSnapshot(snap);
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,2,7", FilesPerLevel());
+ db_->GetLiveFilesMetaData(&live_file_meta);
+ found_sst_with_del = false;
+ for (const auto& meta : live_file_meta) {
+ // check new SST with del (the old one may not yet be deleted after
+ // compaction)
+ if (meta.num_deletions > 0 && meta.file_number != sst_with_del_num) {
+ found_sst_with_del = true;
+ }
+ }
+ ASSERT_FALSE(found_sst_with_del);
+
+  // Now make all data cold: key 0 will be moved to the last level, but key 6
+  // is still protected by snap2, so it is kept at the penultimate level.
+ latest_cold_seq = dbfull()->GetLatestSequenceNumber();
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,1,8", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ db_->ReleaseSnapshot(snap2);
+
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,8", FilesPerLevel());
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+}
+
+TEST_P(TieredCompactionTest, UniversalRangeDelete) {
+ const int kNumTrigger = 4;
+ const int kNumLevels = 7;
+ const int kNumKeys = 10;
+
+ auto factory = std::make_shared<SingleKeySstPartitionerFactory>();
+
+ auto options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ SetColdTemperature(options);
+ options.level0_file_num_compaction_trigger = kNumTrigger;
+ options.statistics = CreateDBStatistics();
+ options.sst_partitioner_factory = factory;
+ options.max_subcompactions = 10;
+ DestroyAndReopen(options);
+
+ std::atomic_uint64_t latest_cold_seq = 0;
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionIterator::PrepareOutput.context", [&](void* arg) {
+ auto context = static_cast<PerKeyPlacementContext*>(arg);
+ context->output_to_penultimate_level =
+ context->seq_num > latest_cold_seq;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(Put(Key(i), "value" + std::to_string(i)));
+ }
+ ASSERT_OK(Flush());
+
+ // compact to the penultimate level with 10 files
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ ASSERT_EQ("0,0,0,0,0,10", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+ // make all data cold
+ latest_cold_seq = dbfull()->GetLatestSequenceNumber();
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,0,10", FilesPerLevel());
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // The range del is considered hot data, but it will be merged into the last
+  // level and dropped together with the data it deletes.
+ std::string start = Key(3);
+ std::string end = Key(5);
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ ASSERT_EQ("0,0,0,0,0,0,8", FilesPerLevel());
+
+ // range del with snapshot should be preserved in the penultimate level
+ auto snap = db_->GetSnapshot();
+
+ start = Key(6);
+ end = Key(8);
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1,8", FilesPerLevel());
+
+  // Add 2 hot keys, each in a new SST; they will be placed in the same level
+  // as the range del, but they don't overlap with it.
+ latest_cold_seq = dbfull()->GetLatestSequenceNumber();
+ ASSERT_OK(Put(Key(4), "new value" + std::to_string(0)));
+ auto snap2 = db_->GetSnapshot();
+ ASSERT_OK(Put(Key(9), "new value" + std::to_string(6)));
+
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,2,8", FilesPerLevel());
+ // find the SST with range del
+ std::vector<LiveFileMetaData> live_file_meta;
+ db_->GetLiveFilesMetaData(&live_file_meta);
+ bool found_sst_with_del = false;
+ uint64_t sst_with_del_num = 0;
+ for (const auto& meta : live_file_meta) {
+ if (meta.num_deletions > 0) {
+ // found SST with del, which has 2 entries, one for data one for range del
+ ASSERT_EQ(meta.level,
+ kNumLevels - 2); // output_to_penultimate_level level
+ ASSERT_EQ(meta.num_entries, 2);
+ ASSERT_EQ(meta.num_deletions, 1);
+ found_sst_with_del = true;
+ sst_with_del_num = meta.file_number;
+ }
+ }
+ ASSERT_TRUE(found_sst_with_del);
+
+  // Release the first snapshot, which should drop the range del; the data on
+  // the same level is still hot.
+ db_->ReleaseSnapshot(snap);
+
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,2,6", FilesPerLevel());
+ db_->GetLiveFilesMetaData(&live_file_meta);
+ // no range del should be found in SST
+ found_sst_with_del = false;
+ for (const auto& meta : live_file_meta) {
+ // check new SST with del (the old one may not yet be deleted after
+ // compaction)
+ if (meta.num_deletions > 0 && meta.file_number != sst_with_del_num) {
+ found_sst_with_del = true;
+ }
+ }
+ ASSERT_FALSE(found_sst_with_del);
+
+  // make all data cold, but key 6 is still protected by snap2
+ latest_cold_seq = dbfull()->GetLatestSequenceNumber();
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1,7", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ db_->ReleaseSnapshot(snap2);
+
+  // release the snapshot; everything goes to the bottommost level
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,0,7", FilesPerLevel());
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+}
+
+TEST_P(TieredCompactionTest, SequenceBasedTieredStorageLevel) {
+ const int kNumTrigger = 4;
+ const int kNumLevels = 7;
+ const int kNumKeys = 100;
+ const int kLastLevel = kNumLevels - 1;
+
+ auto options = CurrentOptions();
+ SetColdTemperature(options);
+ options.level0_file_num_compaction_trigger = kNumTrigger;
+ options.num_levels = kNumLevels;
+ options.statistics = CreateDBStatistics();
+ options.max_subcompactions = 10;
+ DestroyAndReopen(options);
+
+ std::atomic_uint64_t latest_cold_seq = 0;
+ std::vector<SequenceNumber> seq_history;
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionIterator::PrepareOutput.context", [&](void* arg) {
+ auto context = static_cast<PerKeyPlacementContext*>(arg);
+ context->output_to_penultimate_level =
+ context->seq_num > latest_cold_seq;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ std::vector<InternalStats::CompactionStats> expect_stats(kNumLevels);
+ InternalStats::CompactionStats& last_stats = expect_stats[kLastLevel];
+ InternalStats::CompactionStats expect_pl_stats;
+
+ for (int i = 0; i < kNumTrigger; i++) {
+ for (int j = 0; j < kNumKeys; j++) {
+ ASSERT_OK(Put(Key(i * 10 + j), "value" + std::to_string(i)));
+ }
+ ASSERT_OK(Flush());
+ expect_stats[0].Add(kBasicFlushStats);
+ }
+ ASSERT_OK(dbfull()->WaitForCompact(true));
+
+ // non last level is hot
+ ASSERT_EQ("0,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+ expect_stats[1].Add(kBasicCompStats);
+ expect_stats[1].Add(kBasicPerLevelStats);
+ expect_stats[1].ResetCompactionReason(CompactionReason::kLevelL0FilesNum);
+ VerifyCompactionStats(expect_stats, expect_pl_stats);
+
+ // move all data to the last level
+ MoveFilesToLevel(kLastLevel);
+
+ ResetAllStats(expect_stats, expect_pl_stats);
+
+ // The compaction won't move the data up
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ last_stats.Add(kBasicCompStats);
+ last_stats.Add(kBasicPerLevelStats);
+ last_stats.num_dropped_records = 0;
+ last_stats.bytes_read_non_output_levels = 0;
+ last_stats.num_input_files_in_non_output_levels = 0;
+ last_stats.bytes_read_output_level = kHasValue;
+ last_stats.num_input_files_in_output_level = kHasValue;
+ last_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
+ VerifyCompactionStats(expect_stats, expect_pl_stats);
+
+  // Add new data, which is all hot and overwrites all existing data
+ for (int i = 0; i < kNumTrigger; i++) {
+ for (int j = 0; j < kNumKeys; j++) {
+ ASSERT_OK(Put(Key(i * 10 + j), "value" + std::to_string(i)));
+ }
+ ASSERT_OK(Flush());
+ seq_history.emplace_back(dbfull()->GetLatestSequenceNumber());
+ }
+ ASSERT_OK(dbfull()->WaitForCompact(true));
+ ASSERT_EQ("0,1,0,0,0,0,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ ResetAllStats(expect_stats, expect_pl_stats);
+
+ // after compaction, all data are hot
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+ for (int level = 2; level < kNumLevels - 1; level++) {
+ expect_stats[level].bytes_moved = kHasValue;
+ }
+
+ last_stats.Add(kBasicCompStats);
+ last_stats.bytes_read_output_level = kHasValue;
+ last_stats.num_input_files_in_output_level = kHasValue;
+ last_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
+ expect_pl_stats.Add(kBasicPerKeyPlacementCompStats);
+ expect_pl_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
+ VerifyCompactionStats(expect_stats, expect_pl_stats);
+
+  // Move the cold_seq forward and try to split the data into cold and hot.
+  // In this case it's unsafe to split the data, because the file is a
+  // non-last-level but bottommost file: its sequence numbers have been zeroed
+  // out, so the time information is lost (with
+  // `level_compaction_dynamic_level_bytes` or Universal Compaction this
+  // should be rare).
+  // TODO(zjay): ideally avoid zeroing out the non-last-level bottommost file
+ latest_cold_seq = seq_history[1];
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+ seq_history.clear();
+
+ // manually move all data (cold) to last level
+ MoveFilesToLevel(kLastLevel);
+ seq_history.clear();
+ // Add new data once again
+ for (int i = 0; i < kNumTrigger; i++) {
+ for (int j = 0; j < kNumKeys; j++) {
+ ASSERT_OK(Put(Key(i * 10 + j), "value" + std::to_string(i)));
+ }
+ ASSERT_OK(Flush());
+ seq_history.emplace_back(dbfull()->GetLatestSequenceNumber());
+ }
+ ASSERT_OK(dbfull()->WaitForCompact(true));
+
+ latest_cold_seq = seq_history[0];
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ // delete all cold data
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Delete(Key(i)));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+ latest_cold_seq = seq_history[2];
+
+ MoveFilesToLevel(kLastLevel);
+
+ // move forward the cold_seq again with range delete, take a snapshot to keep
+ // the range dels in bottommost
+ auto snap = db_->GetSnapshot();
+
+ std::string start = Key(25), end = Key(35);
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
+  // Add one small key and one large key in the input level, to make sure hot
+  // data within that range can be moved up to the input level.
+ ASSERT_OK(Put(Key(0), "value" + std::to_string(0)));
+ ASSERT_OK(Put(Key(100), "value" + std::to_string(0)));
+
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ // verify data
+ std::string value;
+ for (int i = 1; i < 130; i++) {
+ if (i < 10 || (i >= 25 && i < 35)) {
+ ASSERT_TRUE(db_->Get(ReadOptions(), Key(i), &value).IsNotFound());
+ } else {
+ ASSERT_OK(db_->Get(ReadOptions(), Key(i), &value));
+ }
+ }
+
+ // delete all hot data
+ ASSERT_OK(Delete(Key(0)));
+ start = Key(30);
+  end = Key(101);  // range [101, 130] is cold because it was not in the input
+                   // range of the previous compaction
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ // no range del is dropped because of snapshot
+ ASSERT_EQ(
+ options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE),
+ 0);
+
+ db_->ReleaseSnapshot(snap);
+
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // 3 range dels are dropped; the first one is double counted as expected,
+  // because it was spread across 2 SST files
+ ASSERT_EQ(
+ options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE),
+ 3);
+
+  // Move the cold_seq backward, which might happen when the user changes the
+  // setting. The hot data won't move up; this just makes sure everything
+  // still runs fine. The data stays put because:
+  // 1. the sequence numbers are zeroed out, so there is no time information
+  // 2. leveled compaction only supports moving data up within the higher
+  //    level input range
+ latest_cold_seq = seq_history[1];
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+}
+
+TEST_P(TieredCompactionTest, RangeBasedTieredStorageLevel) {
+ const int kNumTrigger = 4;
+ const int kNumLevels = 7;
+ const int kNumKeys = 100;
+
+ auto options = CurrentOptions();
+ SetColdTemperature(options);
+ options.level0_file_num_compaction_trigger = kNumTrigger;
+ options.level_compaction_dynamic_level_bytes = true;
+ options.num_levels = kNumLevels;
+ options.statistics = CreateDBStatistics();
+ options.max_subcompactions = 10;
+ DestroyAndReopen(options);
+ auto cmp = options.comparator;
+
+ port::Mutex mutex;
+ std::string hot_start = Key(10);
+ std::string hot_end = Key(50);
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionIterator::PrepareOutput.context", [&](void* arg) {
+ auto context = static_cast<PerKeyPlacementContext*>(arg);
+ MutexLock l(&mutex);
+ context->output_to_penultimate_level =
+ cmp->Compare(context->key, hot_start) >= 0 &&
+ cmp->Compare(context->key, hot_end) < 0;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ for (int i = 0; i < kNumTrigger; i++) {
+ for (int j = 0; j < kNumKeys; j++) {
+ ASSERT_OK(Put(Key(j), "value" + std::to_string(j)));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->WaitForCompact(true));
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ // change to all cold
+ {
+ MutexLock l(&mutex);
+ hot_start = Key(100);
+ hot_end = Key(200);
+ }
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // Change to all hot, but level compaction only supports moving cold data to
+  // hot within its higher level input range.
+ {
+ MutexLock l(&mutex);
+ hot_start = Key(0);
+ hot_end = Key(100);
+ }
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ // with mixed hot/cold data
+ {
+ MutexLock l(&mutex);
+ hot_start = Key(50);
+ hot_end = Key(100);
+ }
+ ASSERT_OK(Put(Key(0), "value" + std::to_string(0)));
+ ASSERT_OK(Put(Key(100), "value" + std::to_string(100)));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ // delete all hot data, but with snapshot to keep the range del
+ auto snap = db_->GetSnapshot();
+ std::string start = Key(50);
+ std::string end = Key(100);
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ // no range del is dropped because of snapshot
+ ASSERT_EQ(
+ options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE),
+ 0);
+
+  // Releasing the snapshot and compacting again should remove all hot data.
+ db_->ReleaseSnapshot(snap);
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ ASSERT_EQ(
+ options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE),
+ 1);
+}
+
+INSTANTIATE_TEST_CASE_P(TieredCompactionTest, TieredCompactionTest,
+ testing::Bool());
+
+class PrecludeLastLevelTest : public DBTestBase {
+ public:
+ PrecludeLastLevelTest()
+ : DBTestBase("preclude_last_level_test", /*env_do_fsync=*/false) {
+ mock_clock_ = std::make_shared<MockSystemClock>(env_->GetSystemClock());
+ mock_env_ = std::make_unique<CompositeEnvWrapper>(env_, mock_clock_);
+ }
+
+ protected:
+ std::unique_ptr<Env> mock_env_;
+ std::shared_ptr<MockSystemClock> mock_clock_;
+
+ void SetUp() override {
+ mock_clock_->InstallTimedWaitFixCallback();
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::StartPeriodicTaskScheduler:Init", [&](void* arg) {
+ auto periodic_task_scheduler_ptr =
+ reinterpret_cast<PeriodicTaskScheduler*>(arg);
+ periodic_task_scheduler_ptr->TEST_OverrideTimer(mock_clock_.get());
+ });
+ mock_clock_->SetCurrentTime(0);
+ }
+};
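+
+// The tests below share a common write pattern (sketched here for orientation,
+// using the fixture's mock clock): every Put() is followed by advancing the
+// mock clock so the periodic task records a non-zero seqno -> time mapping:
+//
+//   ASSERT_OK(Put(Key(i), "value"));
+//   dbfull()->TEST_WaitForPeridicTaskRun(
+//       [&] { mock_clock_->MockSleepForSeconds(kKeyPerSec); });
+//
+// Without the initial sleep in each test, the first keys would get write time
+// zero, which internally means kUnknownSeqnoTime.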
+
+TEST_F(PrecludeLastLevelTest, MigrationFromPreserveTimeManualCompaction) {
+ const int kNumTrigger = 4;
+ const int kNumLevels = 7;
+ const int kNumKeys = 100;
+ const int kKeyPerSec = 10;
+
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.preserve_internal_time_seconds = 10000;
+ options.env = mock_env_.get();
+ options.level0_file_num_compaction_trigger = kNumTrigger;
+ options.num_levels = kNumLevels;
+ DestroyAndReopen(options);
+
+  // Pass some time first, otherwise the write times of the first few keys are
+  // zero, and internally zero has a special meaning: kUnknownSeqnoTime
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec)); });
+
+ int sst_num = 0;
+  // Write overlapping files, enough to trigger compaction
+ for (; sst_num < kNumTrigger; sst_num++) {
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun([&] {
+ mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec));
+ });
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->WaitForCompact(true));
+
+ // all data is pushed to the last level
+ ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+
+ // enable preclude feature
+ options.preclude_last_level_data_seconds = 10000;
+ options.last_level_temperature = Temperature::kCold;
+ Reopen(options);
+
+  // all data is hot, even though it's in the last level
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+
+  // Generate an sstable and trigger manual compaction
+ ASSERT_OK(Put(Key(10), "value"));
+ ASSERT_OK(Flush());
+
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ // all data is moved up to the penultimate level
+ ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+
+  // Close the DB explicitly, because the mock env is owned by this test and
+  // would otherwise be released before the DB is closed.
+ Close();
+}
+
+TEST_F(PrecludeLastLevelTest, MigrationFromPreserveTimeAutoCompaction) {
+ const int kNumTrigger = 4;
+ const int kNumLevels = 7;
+ const int kNumKeys = 100;
+ const int kKeyPerSec = 10;
+
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.preserve_internal_time_seconds = 10000;
+ options.env = mock_env_.get();
+ options.level0_file_num_compaction_trigger = kNumTrigger;
+ options.num_levels = kNumLevels;
+ DestroyAndReopen(options);
+
+  // Pass some time first, otherwise the write times of the first few keys are
+  // zero, and internally zero has a special meaning: kUnknownSeqnoTime
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec)); });
+
+ int sst_num = 0;
+  // Write overlapping files, enough to trigger compaction
+ for (; sst_num < kNumTrigger; sst_num++) {
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun([&] {
+ mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec));
+ });
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->WaitForCompact(true));
+
+ // all data is pushed to the last level
+ ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+
+ // enable preclude feature
+ options.preclude_last_level_data_seconds = 10000;
+ options.last_level_temperature = Temperature::kCold;
+  // Make sure it won't trigger a size-amp compaction. Unlike a normal
+  // size-amp compaction, which is typically a last level compaction, when
+  // tiered storage ("preclude_last_level") is enabled size amp won't include
+  // the last level: the last level would be in the cold tier where size is
+  // not a problem, and this also avoids frequent hot-to-cold compactions.
+ options.compaction_options_universal.max_size_amplification_percent = 400;
+ Reopen(options);
+
+  // all data is hot, even though it's in the last level
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+
+  // Write more data, still all hot until the 10th SST: a key is written every
+  // 10 seconds and there are 100 keys per SST, so each SST spans 1000 seconds,
+  // while preclude_last_level_data_seconds is 10k.
+ Random rnd(301);
+ for (; sst_num < kNumTrigger * 2 - 1; sst_num++) {
+ for (int i = 0; i < kNumKeys; i++) {
+ // the value needs to be big enough to trigger full compaction
+ ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), rnd.RandomString(100)));
+ dbfull()->TEST_WaitForPeridicTaskRun([&] {
+ mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec));
+ });
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->WaitForCompact(true));
+ }
+
+ // all data is moved up to the penultimate level
+ ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+
+  // Close the DB explicitly, because the mock env is owned by this test and
+  // would otherwise be released before the DB is closed.
+ Close();
+}
+
+TEST_F(PrecludeLastLevelTest, MigrationFromPreserveTimePartial) {
+ const int kNumTrigger = 4;
+ const int kNumLevels = 7;
+ const int kNumKeys = 100;
+ const int kKeyPerSec = 10;
+
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.preserve_internal_time_seconds = 2000;
+ options.env = mock_env_.get();
+ options.level0_file_num_compaction_trigger = kNumTrigger;
+ options.num_levels = kNumLevels;
+ DestroyAndReopen(options);
+
+  // Pass some time first, otherwise the write times of the first few keys are
+  // zero, and internally zero has a special meaning: kUnknownSeqnoTime
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec)); });
+
+ int sst_num = 0;
+  // Write overlapping files, enough to trigger compaction
+ for (; sst_num < kNumTrigger; sst_num++) {
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun([&] {
+ mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec));
+ });
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->WaitForCompact(true));
+
+ // all data is pushed to the last level
+ ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+
+ std::vector<KeyVersion> key_versions;
+ ASSERT_OK(GetAllKeyVersions(db_, Slice(), Slice(),
+ std::numeric_limits<size_t>::max(),
+ &key_versions));
+
+ // make sure there are more than 300 keys, the first 100 keys have their
+ // seqnos zeroed out, and the last 100 keys do not
+ ASSERT_GT(key_versions.size(), 300);
+ for (int i = 0; i < 100; i++) {
+ ASSERT_EQ(key_versions[i].sequence, 0);
+ }
+ auto rit = key_versions.rbegin();
+ for (int i = 0; i < 100; i++) {
+ ASSERT_GT(rit->sequence, 0);
+ rit++;
+ }
+
+ // enable preclude feature
+ options.preclude_last_level_data_seconds = 2000;
+ options.last_level_temperature = Temperature::kCold;
+ Reopen(options);
+
+ // Generate an sstable and trigger a manual compaction
+ ASSERT_OK(Put(Key(10), "value"));
+ ASSERT_OK(Flush());
+
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ // some data is moved up, some is not
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+
+ Close();
+}
+
+TEST_F(PrecludeLastLevelTest, SmallPrecludeTime) {
+ const int kNumTrigger = 4;
+ const int kNumLevels = 7;
+ const int kNumKeys = 100;
+
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.preclude_last_level_data_seconds = 60;
+ options.preserve_internal_time_seconds = 0;
+ options.env = mock_env_.get();
+ options.level0_file_num_compaction_trigger = kNumTrigger;
+ options.num_levels = kNumLevels;
+ options.last_level_temperature = Temperature::kCold;
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+
+ dbfull()->TEST_WaitForPeridicTaskRun([&] {
+ mock_clock_->MockSleepForSeconds(static_cast<int>(rnd.Uniform(10) + 1));
+ });
+
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(100)));
+ dbfull()->TEST_WaitForPeridicTaskRun([&] {
+ mock_clock_->MockSleepForSeconds(static_cast<int>(rnd.Uniform(2)));
+ });
+ }
+ ASSERT_OK(Flush());
+
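+ // Even with preserve_internal_time_seconds = 0, setting
+ // preclude_last_level_data_seconds should be enough to make the flushed SST
+ // carry a seqno-to-time mapping, which is what the checks below verify.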
+ TablePropertiesCollection tables_props;
+ ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
+ ASSERT_EQ(tables_props.size(), 1);
+ ASSERT_FALSE(tables_props.begin()->second->seqno_to_time_mapping.empty());
+ SeqnoToTimeMapping tp_mapping;
+ ASSERT_OK(
+ tp_mapping.Add(tables_props.begin()->second->seqno_to_time_mapping));
+ ASSERT_OK(tp_mapping.Sort());
+ ASSERT_FALSE(tp_mapping.Empty());
+ auto seqs = tp_mapping.TEST_GetInternalMapping();
+ ASSERT_FALSE(seqs.empty());
+
+ // Wait longer than the preclude_last_level time, then make sure all the
+ // data is compacted to the last level even though there's no new write (no
+ // seqno -> time information was flushed to any SST).
+ mock_clock_->MockSleepForSeconds(100);
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ Close();
+}
+
+TEST_F(PrecludeLastLevelTest, LastLevelOnlyCompactionPartial) {
+ const int kNumTrigger = 4;
+ const int kNumLevels = 7;
+ const int kNumKeys = 100;
+ const int kKeyPerSec = 10;
+
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.preserve_internal_time_seconds = 2000;
+ options.env = mock_env_.get();
+ options.level0_file_num_compaction_trigger = kNumTrigger;
+ options.num_levels = kNumLevels;
+ DestroyAndReopen(options);
+
+ // pass some time first, otherwise the write times of the first few keys are
+ // going to be zero, and internally zero has a special meaning:
+ // kUnknownSeqnoTime
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec)); });
+
+ int sst_num = 0;
+ // Write overlapping files, enough to trigger compaction
+ for (; sst_num < kNumTrigger; sst_num++) {
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun([&] {
+ mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec));
+ });
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->WaitForCompact(true));
+
+ // all data is pushed to the last level
+ ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+
+ // enable preclude feature
+ options.preclude_last_level_data_seconds = 2000;
+ options.last_level_temperature = Temperature::kCold;
+ Reopen(options);
+
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ // some data is moved up, some is not
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+
+ std::vector<KeyVersion> key_versions;
+ ASSERT_OK(GetAllKeyVersions(db_, Slice(), Slice(),
+ std::numeric_limits<size_t>::max(),
+ &key_versions));
+
+ // make sure there are more than 300 keys, the first 100 keys have their
+ // seqnos zeroed out, and the last 100 keys do not
+ ASSERT_GT(key_versions.size(), 300);
+ for (int i = 0; i < 100; i++) {
+ ASSERT_EQ(key_versions[i].sequence, 0);
+ }
+ auto rit = key_versions.rbegin();
+ for (int i = 0; i < 100; i++) {
+ ASSERT_GT(rit->sequence, 0);
+ rit++;
+ }
+
+ Close();
+}
+
+class PrecludeLastLevelTestWithParms
+ : public PrecludeLastLevelTest,
+ public testing::WithParamInterface<bool> {
+ public:
+ PrecludeLastLevelTestWithParms() : PrecludeLastLevelTest() {}
+};
+
+TEST_P(PrecludeLastLevelTestWithParms, LastLevelOnlyCompactionNoPreclude) {
+ const int kNumTrigger = 4;
+ const int kNumLevels = 7;
+ const int kNumKeys = 100;
+ const int kKeyPerSec = 10;
+
+ bool enable_preclude_last_level = GetParam();
+
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.preserve_internal_time_seconds = 2000;
+ options.env = mock_env_.get();
+ options.level0_file_num_compaction_trigger = kNumTrigger;
+ options.num_levels = kNumLevels;
+ DestroyAndReopen(options);
+
+ // pass some time first, otherwise the write times of the first few keys are
+ // going to be zero, and internally zero has a special meaning:
+ // kUnknownSeqnoTime
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec)); });
+
+ Random rnd(301);
+ int sst_num = 0;
+ // Write overlapping files, enough to trigger compaction
+ for (; sst_num < kNumTrigger; sst_num++) {
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), rnd.RandomString(100)));
+ dbfull()->TEST_WaitForPeridicTaskRun([&] {
+ mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec));
+ });
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->WaitForCompact(true));
+
+ // all data is pushed to the last level
+ ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+
+ std::atomic_bool is_manual_compaction_running = false;
+ std::atomic_bool verified_compaction_order = false;
+
+ // Make sure the manual compaction is in progress, then try to trigger a
+ // SizeRatio compaction by flushing 4 files to L0. That compaction would try
+ // to compact the 4 files at L0 to L5 (the last empty level).
+ // If the preclude_last_level feature is enabled, the auto-triggered
+ // compaction cannot be picked. Otherwise, the auto-triggered compaction can
+ // run in parallel with the last-level compaction.
+ // L0: [a] [b] [c] [d]
+ // L5: (locked if preclude_last_level is enabled)
+ // L6: [z] (locked: manual compaction in progress)
+ // TODO: in this case, the L0 files should just be compacted to L4, so the
+ // two compactions won't overlap.
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::ProcessKeyValueCompaction()::Processing", [&](void* arg) {
+ auto compaction = static_cast<Compaction*>(arg);
+ if (compaction->is_manual_compaction()) {
+ is_manual_compaction_running = true;
+ TEST_SYNC_POINT(
+ "PrecludeLastLevelTest::LastLevelOnlyCompactionConflit:"
+ "ManualCompaction1");
+ TEST_SYNC_POINT(
+ "PrecludeLastLevelTest::LastLevelOnlyCompactionConflit:"
+ "ManualCompaction2");
+ is_manual_compaction_running = false;
+ }
+ });
+
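+ // The callback below verifies the picking behavior: while the manual
+ // compaction holds the last level, an auto compaction can only be picked
+ // when the preclude feature is off; with it on, PickCompaction is expected
+ // to return nullptr.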
+ SyncPoint::GetInstance()->SetCallBack(
+ "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) {
+ auto compaction = static_cast<Compaction*>(arg);
+ if (enable_preclude_last_level && is_manual_compaction_running) {
+ ASSERT_TRUE(compaction == nullptr);
+ verified_compaction_order = true;
+ } else {
+ ASSERT_TRUE(compaction != nullptr);
+ verified_compaction_order = true;
+ }
+ if (!compaction || !compaction->is_manual_compaction()) {
+ TEST_SYNC_POINT(
+ "PrecludeLastLevelTest::LastLevelOnlyCompactionConflit:"
+ "AutoCompactionPicked");
+ }
+ });
+
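+ // Enforce the ordering: the manual compaction must be running before the
+ // extra files are flushed (StartWrite), and an auto compaction must have
+ // been considered (AutoCompactionPicked) before the manual compaction is
+ // allowed to finish (ManualCompaction2).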
+ SyncPoint::GetInstance()->LoadDependency({
+ {"PrecludeLastLevelTest::LastLevelOnlyCompactionConflit:"
+ "ManualCompaction1",
+ "PrecludeLastLevelTest::LastLevelOnlyCompactionConflit:StartWrite"},
+ {"PrecludeLastLevelTest::LastLevelOnlyCompactionConflit:"
+ "AutoCompactionPicked",
+ "PrecludeLastLevelTest::LastLevelOnlyCompactionConflit:"
+ "ManualCompaction2"},
+ });
+
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ // only enable the preclude feature if the parameter is true
+ if (enable_preclude_last_level) {
+ options.preclude_last_level_data_seconds = 2000;
+ }
+ options.max_background_jobs = 8;
+ options.last_level_temperature = Temperature::kCold;
+ Reopen(options);
+
+ auto manual_compaction_thread = port::Thread([this]() {
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ cro.exclusive_manual_compaction = false;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ });
+
+ TEST_SYNC_POINT(
+ "PrecludeLastLevelTest::LastLevelOnlyCompactionConflit:StartWrite");
+ auto stop_token =
+ dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
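+ // The compaction pressure token grabbed above presumably signals compaction
+ // pressure to the write controller, so background compactions keep being
+ // scheduled while the files below are flushed.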
+ for (; sst_num < kNumTrigger * 2; sst_num++) {
+ for (int i = 0; i < kNumKeys; i++) {
+ // the value needs to be big enough to trigger full compaction
+ ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun([&] {
+ mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec));
+ });
+ }
+ ASSERT_OK(Flush());
+ }
+
+ manual_compaction_thread.join();
+
+ ASSERT_OK(dbfull()->WaitForCompact(true));
+
+ if (enable_preclude_last_level) {
+ ASSERT_NE("0,0,0,0,0,1,1", FilesPerLevel());
+ } else {
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+ }
+ ASSERT_TRUE(verified_compaction_order);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ stop_token.reset();
+
+ Close();
+}
+
+INSTANTIATE_TEST_CASE_P(PrecludeLastLevelTestWithParms,
+ PrecludeLastLevelTestWithParms, testing::Bool());
+
+// partition the SST into 3 ranges [0, 19] [20, 39] [40, ...]
+class ThreeRangesPartitioner : public SstPartitioner {
+ public:
+ const char* Name() const override { return "SingleKeySstPartitioner"; }
+
+ PartitionerResult ShouldPartition(
+ const PartitionerRequest& request) override {
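+ // Request a split whenever the output crosses Key(20) or Key(40), i.e. the
+ // previous key is below the boundary and the current key is at or above
+ // it, yielding the ranges [0,19], [20,39], [40, ...].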
+ if ((cmp->CompareWithoutTimestamp(*request.current_user_key,
+ DBTestBase::Key(20)) >= 0 &&
+ cmp->CompareWithoutTimestamp(*request.prev_user_key,
+ DBTestBase::Key(20)) < 0) ||
+ (cmp->CompareWithoutTimestamp(*request.current_user_key,
+ DBTestBase::Key(40)) >= 0 &&
+ cmp->CompareWithoutTimestamp(*request.prev_user_key,
+ DBTestBase::Key(40)) < 0)) {
+ return kRequired;
+ } else {
+ return kNotRequired;
+ }
+ }
+
+ bool CanDoTrivialMove(const Slice& /*smallest_user_key*/,
+ const Slice& /*largest_user_key*/) override {
+ return false;
+ }
+
+ const Comparator* cmp = BytewiseComparator();
+};
+
+class ThreeRangesPartitionerFactory : public SstPartitionerFactory {
+ public:
+ static const char* kClassName() {
+ return "TombstoneTestSstPartitionerFactory";
+ }
+ const char* Name() const override { return kClassName(); }
+
+ std::unique_ptr<SstPartitioner> CreatePartitioner(
+ const SstPartitioner::Context& /* context */) const override {
+ return std::unique_ptr<SstPartitioner>(new ThreeRangesPartitioner());
+ }
+};
+
+TEST_F(PrecludeLastLevelTest, PartialPenultimateLevelCompaction) {
+ const int kNumTrigger = 4;
+ const int kNumLevels = 7;
+ const int kKeyPerSec = 10;
+
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.env = mock_env_.get();
+ options.level0_file_num_compaction_trigger = kNumTrigger;
+ options.preserve_internal_time_seconds = 10000;
+ options.num_levels = kNumLevels;
+ DestroyAndReopen(options);
+
+ // pass some time first, otherwise the write times of the first few keys are
+ // going to be zero, and internally zero has a special meaning:
+ // kUnknownSeqnoTime
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
+
+ Random rnd(301);
+
+ for (int i = 0; i < 300; i++) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(100)));
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(kKeyPerSec); });
+ }
+ ASSERT_OK(Flush());
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ // make sure all data is compacted to the last level
+ ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+
+ // Create 3 L5 files
+ auto factory = std::make_shared<ThreeRangesPartitionerFactory>();
+ options.sst_partitioner_factory = factory;
+
+ Reopen(options);
+
+ for (int i = 0; i < kNumTrigger - 1; i++) {
+ for (int j = 0; j < 100; j++) {
+ ASSERT_OK(Put(Key(i * 100 + j), rnd.RandomString(10)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ ASSERT_OK(dbfull()->WaitForCompact(true));
+
+ // L5: [0,19] [20,39] [40,299]
+ // L6: [0, 299]
+ ASSERT_EQ("0,0,0,0,0,3,1", FilesPerLevel());
+
+ // enable tiered storage feature
+ options.preclude_last_level_data_seconds = 10000;
+ options.last_level_temperature = Temperature::kCold;
+ options.statistics = CreateDBStatistics();
+ Reopen(options);
+
+ ColumnFamilyMetaData meta;
+ db_->GetColumnFamilyMetaData(&meta);
+ ASSERT_EQ(meta.levels[5].files.size(), 3);
+ ASSERT_EQ(meta.levels[6].files.size(), 1);
+ ASSERT_EQ(meta.levels[6].files[0].smallestkey, Key(0));
+ ASSERT_EQ(meta.levels[6].files[0].largestkey, Key(299));
+
+ std::string file_path = meta.levels[5].files[1].db_path;
+ std::vector<std::string> files;
+ // pick the 3rd file @L5 and the file @L6 for compaction
+ files.push_back(file_path + "/" + meta.levels[5].files[2].name);
+ files.push_back(file_path + "/" + meta.levels[6].files[0].name);
+ ASSERT_OK(db_->CompactFiles(CompactionOptions(), files, 6));
+
+ // The compaction only moved part of the hot data to the hot tier; range
+ // [0,39] is unsafe to move up, otherwise it would overlap with the existing
+ // files @L5.
+ // The output should be:
+ // L5: [0,19] [20,39] [40,299] <-- Temperature::kUnknown
+ // L6: [0,19] [20,39] <-- Temperature::kCold
+ // L6 file is split because of the customized partitioner
+ ASSERT_EQ("0,0,0,0,0,3,2", FilesPerLevel());
+
+ // even though all the data is hot, not all of it is moved to the hot tier
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ db_->GetColumnFamilyMetaData(&meta);
+ ASSERT_EQ(meta.levels[5].files.size(), 3);
+ ASSERT_EQ(meta.levels[6].files.size(), 2);
+ for (const auto& file : meta.levels[5].files) {
+ ASSERT_EQ(file.temperature, Temperature::kUnknown);
+ }
+ for (const auto& file : meta.levels[6].files) {
+ ASSERT_EQ(file.temperature, Temperature::kCold);
+ }
+ ASSERT_EQ(meta.levels[6].files[0].smallestkey, Key(0));
+ ASSERT_EQ(meta.levels[6].files[0].largestkey, Key(19));
+ ASSERT_EQ(meta.levels[6].files[1].smallestkey, Key(20));
+ ASSERT_EQ(meta.levels[6].files[1].largestkey, Key(39));
+
+ Close();
+}
+
+struct TestPropertiesCollector : public TablePropertiesCollector {
+ Status AddUserKey(const Slice& key, const Slice& /*value*/,
+ EntryType /*type*/, SequenceNumber /*seq*/,
+ uint64_t /*file_size*/) override {
+ if (cmp->Compare(key, DBTestBase::Key(100)) == 0) {
+ has_key_100 = true;
+ }
+ if (cmp->Compare(key, DBTestBase::Key(200)) == 0) {
+ has_key_200 = true;
+ }
+
+ return Status::OK();
+ }
+
+ const char* Name() const override { return "TestTablePropertiesCollector"; }
+
+ UserCollectedProperties GetReadableProperties() const override {
+ UserCollectedProperties ret;
+ return ret;
+ }
+
+ Status Finish(UserCollectedProperties* /*properties*/) override {
+ // The LSM tree would look like:
+ // L5: [0,19] [20,39] [40,299]
+ // L6: [0, 299]
+ // The 3rd file @L5 has both key 100 and key 200, so it will be marked for
+ // compaction.
+ // This also avoids marking a freshly flushed SST for compaction, since such
+ // a file won't have both 100 and 200.
+ if (has_key_100 && has_key_200) {
+ need_compact_ = true;
+ } else {
+ need_compact_ = false;
+ }
+ has_key_100 = false;
+ has_key_200 = false;
+ return Status::OK();
+ }
+
+ bool NeedCompact() const override { return need_compact_; }
+
+ const Comparator* cmp = BytewiseComparator();
+
+ private:
+ bool has_key_100 = false;
+ bool has_key_200 = false;
+
+ bool need_compact_ = false;
+};
+
+class TestPropertiesCollectorFactory : public TablePropertiesCollectorFactory {
+ public:
+ TablePropertiesCollector* CreateTablePropertiesCollector(
+ TablePropertiesCollectorFactory::Context /*context*/) override {
+ return new TestPropertiesCollector;
+ }
+ const char* Name() const override { return "TestTablePropertiesCollector"; }
+};
+
+TEST_F(PrecludeLastLevelTest, PartialPenultimateLevelCompactionWithRangeDel) {
+ const int kNumTrigger = 4;
+ const int kNumLevels = 7;
+ const int kKeyPerSec = 10;
+
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.env = mock_env_.get();
+ options.level0_file_num_compaction_trigger = kNumTrigger;
+ options.preserve_internal_time_seconds = 10000;
+ options.num_levels = kNumLevels;
+ // set a small max_compaction_bytes to avoid input level expansion
+ options.max_compaction_bytes = 30000;
+ options.ignore_max_compaction_bytes_for_input = false;
+ DestroyAndReopen(options);
+
+ // pass some time first, otherwise the write times of the first few keys are
+ // going to be zero, and internally zero has a special meaning:
+ // kUnknownSeqnoTime
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
+
+ Random rnd(301);
+
+ for (int i = 0; i < 300; i++) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(100)));
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(kKeyPerSec); });
+ }
+ ASSERT_OK(Flush());
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ // make sure all data is compacted to the last level
+ ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+
+ // Create 3 L5 files
+ auto factory = std::make_shared<ThreeRangesPartitionerFactory>();
+ options.sst_partitioner_factory = factory;
+
+ // the user-defined properties collector will mark the 3rd file for
+ // compaction
+ auto collector_factory = std::make_shared<TestPropertiesCollectorFactory>();
+ options.table_properties_collector_factories.resize(1);
+ options.table_properties_collector_factories[0] = collector_factory;
+ // enable tiered storage feature
+ options.preclude_last_level_data_seconds = 10000;
+ options.last_level_temperature = Temperature::kCold;
+ Reopen(options);
+
+ for (int i = 0; i < kNumTrigger - 2; i++) {
+ for (int j = 0; j < 100; j++) {
+ ASSERT_OK(Put(Key(i * 100 + j), rnd.RandomString(10)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ // make sure there is one and only one compaction that supports per-key
+ // placement but has the penultimate level output disabled.
+ std::atomic_int per_key_comp_num = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) {
+ auto compaction = static_cast<Compaction*>(arg);
+ if (compaction->SupportsPerKeyPlacement()) {
+ ASSERT_EQ(compaction->GetPenultimateOutputRangeType(),
+ Compaction::PenultimateOutputRangeType::kDisabled);
+ per_key_comp_num++;
+ }
+ });
+
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ for (int j = 0; j < 100; j++) {
+ ASSERT_OK(Put(Key(200 + j), rnd.RandomString(10)));
+ }
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key(32), Key(40)));
+ ASSERT_OK(Flush());
+
+ // Before the per-key placement compaction, the LSM tree should look like:
+ // L5: [0,19] [20,40] [40,299]
+ // L6: [0, 299]
+ // The 2nd file @L5 has largest key 40 because of the range deletion
+
+ ASSERT_OK(dbfull()->WaitForCompact(true));
+
+ ASSERT_EQ(per_key_comp_num, 1);
+
+ // the compaction won't move any data to the penultimate level
+ ASSERT_EQ("0,0,0,0,0,2,3", FilesPerLevel());
+
+ Close();
+}
+
+#endif // !defined(ROCKSDB_LITE)
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+#if !defined(ROCKSDB_LITE)
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+#else
+ (void)argc;
+ (void)argv;
+ return 0;
+#endif
+}