summaryrefslogtreecommitdiffstats
path: root/src/rocksdb/db/db_compaction_test.cc
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-27 18:24:20 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-27 18:24:20 +0000
commit483eb2f56657e8e7f419ab1a4fab8dce9ade8609 (patch)
treee5d88d25d870d5dedacb6bbdbe2a966086a0a5cf /src/rocksdb/db/db_compaction_test.cc
parentInitial commit. (diff)
downloadceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.tar.xz
ceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.zip
Adding upstream version 14.2.21.upstream/14.2.21upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/rocksdb/db/db_compaction_test.cc')
-rw-r--r--src/rocksdb/db/db_compaction_test.cc4361
1 files changed, 4361 insertions, 0 deletions
diff --git a/src/rocksdb/db/db_compaction_test.cc b/src/rocksdb/db/db_compaction_test.cc
new file mode 100644
index 00000000..df51ef2c
--- /dev/null
+++ b/src/rocksdb/db/db_compaction_test.cc
@@ -0,0 +1,4361 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_test_util.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/concurrent_task_limiter.h"
+#include "rocksdb/experimental.h"
+#include "rocksdb/utilities/convenience.h"
+#include "util/concurrent_task_limiter_impl.h"
+#include "util/fault_injection_test_env.h"
+#include "util/sync_point.h"
+
+namespace rocksdb {
+
+// SYNC_POINT is not supported in released Windows mode.
+#if !defined(ROCKSDB_LITE)
+
+class DBCompactionTest : public DBTestBase {
+ public:
+ DBCompactionTest() : DBTestBase("/db_compaction_test") {}
+};
+
+class DBCompactionTestWithParam
+ : public DBTestBase,
+ public testing::WithParamInterface<std::tuple<uint32_t, bool>> {
+ public:
+ DBCompactionTestWithParam() : DBTestBase("/db_compaction_test") {
+ max_subcompactions_ = std::get<0>(GetParam());
+ exclusive_manual_compaction_ = std::get<1>(GetParam());
+ }
+
+ // Required if inheriting from testing::WithParamInterface<>
+ static void SetUpTestCase() {}
+ static void TearDownTestCase() {}
+
+ uint32_t max_subcompactions_;
+ bool exclusive_manual_compaction_;
+};
+
+class DBCompactionDirectIOTest : public DBCompactionTest,
+ public ::testing::WithParamInterface<bool> {
+ public:
+ DBCompactionDirectIOTest() : DBCompactionTest() {}
+};
+
+namespace {
+
+class FlushedFileCollector : public EventListener {
+ public:
+ FlushedFileCollector() {}
+ ~FlushedFileCollector() override {}
+
+ void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override {
+ std::lock_guard<std::mutex> lock(mutex_);
+ flushed_files_.push_back(info.file_path);
+ }
+
+ std::vector<std::string> GetFlushedFiles() {
+ std::lock_guard<std::mutex> lock(mutex_);
+ std::vector<std::string> result;
+ for (auto fname : flushed_files_) {
+ result.push_back(fname);
+ }
+ return result;
+ }
+
+ void ClearFlushedFiles() { flushed_files_.clear(); }
+
+ private:
+ std::vector<std::string> flushed_files_;
+ std::mutex mutex_;
+};
+
+class CompactionStatsCollector : public EventListener {
+public:
+ CompactionStatsCollector()
+ : compaction_completed_(static_cast<int>(CompactionReason::kNumOfReasons)) {
+ for (auto& v : compaction_completed_) {
+ v.store(0);
+ }
+ }
+
+ ~CompactionStatsCollector() override {}
+
+ void OnCompactionCompleted(DB* /* db */,
+ const CompactionJobInfo& info) override {
+ int k = static_cast<int>(info.compaction_reason);
+ int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+ assert(k >= 0 && k < num_of_reasons);
+ compaction_completed_[k]++;
+ }
+
+ void OnExternalFileIngested(
+ DB* /* db */, const ExternalFileIngestionInfo& /* info */) override {
+ int k = static_cast<int>(CompactionReason::kExternalSstIngestion);
+ compaction_completed_[k]++;
+ }
+
+ void OnFlushCompleted(DB* /* db */, const FlushJobInfo& /* info */) override {
+ int k = static_cast<int>(CompactionReason::kFlush);
+ compaction_completed_[k]++;
+ }
+
+ int NumberOfCompactions(CompactionReason reason) const {
+ int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+ int k = static_cast<int>(reason);
+ assert(k >= 0 && k < num_of_reasons);
+ return compaction_completed_.at(k).load();
+ }
+
+private:
+ std::vector<std::atomic<int>> compaction_completed_;
+};
+
+class SstStatsCollector : public EventListener {
+ public:
+ SstStatsCollector() : num_ssts_creation_started_(0) {}
+
+ void OnTableFileCreationStarted(
+ const TableFileCreationBriefInfo& /* info */) override {
+ ++num_ssts_creation_started_;
+ }
+
+ int num_ssts_creation_started() { return num_ssts_creation_started_; }
+
+ private:
+ std::atomic<int> num_ssts_creation_started_;
+};
+
+static const int kCDTValueSize = 1000;
+static const int kCDTKeysPerBuffer = 4;
+static const int kCDTNumLevels = 8;
+Options DeletionTriggerOptions(Options options) {
+ options.compression = kNoCompression;
+ options.write_buffer_size = kCDTKeysPerBuffer * (kCDTValueSize + 24);
+ options.min_write_buffer_number_to_merge = 1;
+ options.max_write_buffer_number_to_maintain = 0;
+ options.num_levels = kCDTNumLevels;
+ options.level0_file_num_compaction_trigger = 1;
+ options.target_file_size_base = options.write_buffer_size * 2;
+ options.target_file_size_multiplier = 2;
+ options.max_bytes_for_level_base =
+ options.target_file_size_base * options.target_file_size_multiplier;
+ options.max_bytes_for_level_multiplier = 2;
+ options.disable_auto_compactions = false;
+ return options;
+}
+
+bool HaveOverlappingKeyRanges(
+ const Comparator* c,
+ const SstFileMetaData& a, const SstFileMetaData& b) {
+ if (c->Compare(a.smallestkey, b.smallestkey) >= 0) {
+ if (c->Compare(a.smallestkey, b.largestkey) <= 0) {
+ // b.smallestkey <= a.smallestkey <= b.largestkey
+ return true;
+ }
+ } else if (c->Compare(a.largestkey, b.smallestkey) >= 0) {
+ // a.smallestkey < b.smallestkey <= a.largestkey
+ return true;
+ }
+ if (c->Compare(a.largestkey, b.largestkey) <= 0) {
+ if (c->Compare(a.largestkey, b.smallestkey) >= 0) {
+ // b.smallestkey <= a.largestkey <= b.largestkey
+ return true;
+ }
+ } else if (c->Compare(a.smallestkey, b.largestkey) <= 0) {
+ // a.smallestkey <= b.largestkey < a.largestkey
+ return true;
+ }
+ return false;
+}
+
+// Identifies all files between level "min_level" and "max_level"
+// which has overlapping key range with "input_file_meta".
+void GetOverlappingFileNumbersForLevelCompaction(
+ const ColumnFamilyMetaData& cf_meta,
+ const Comparator* comparator,
+ int min_level, int max_level,
+ const SstFileMetaData* input_file_meta,
+ std::set<std::string>* overlapping_file_names) {
+ std::set<const SstFileMetaData*> overlapping_files;
+ overlapping_files.insert(input_file_meta);
+ for (int m = min_level; m <= max_level; ++m) {
+ for (auto& file : cf_meta.levels[m].files) {
+ for (auto* included_file : overlapping_files) {
+ if (HaveOverlappingKeyRanges(
+ comparator, *included_file, file)) {
+ overlapping_files.insert(&file);
+ overlapping_file_names->insert(file.name);
+ break;
+ }
+ }
+ }
+ }
+}
+
+void VerifyCompactionResult(
+ const ColumnFamilyMetaData& cf_meta,
+ const std::set<std::string>& overlapping_file_numbers) {
+#ifndef NDEBUG
+ for (auto& level : cf_meta.levels) {
+ for (auto& file : level.files) {
+ assert(overlapping_file_numbers.find(file.name) ==
+ overlapping_file_numbers.end());
+ }
+ }
+#endif
+}
+
+/*
+ * Verifies compaction stats of cfd are valid.
+ *
+ * For each level of cfd, its compaction stats are valid if
+ * 1) sum(stat.counts) == stat.count, and
+ * 2) stat.counts[i] == collector.NumberOfCompactions(i)
+ */
+void VerifyCompactionStats(ColumnFamilyData& cfd,
+ const CompactionStatsCollector& collector) {
+#ifndef NDEBUG
+ InternalStats* internal_stats_ptr = cfd.internal_stats();
+ ASSERT_TRUE(internal_stats_ptr != nullptr);
+ const std::vector<InternalStats::CompactionStats>& comp_stats =
+ internal_stats_ptr->TEST_GetCompactionStats();
+ const int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+ std::vector<int> counts(num_of_reasons, 0);
+ // Count the number of compactions caused by each CompactionReason across
+ // all levels.
+ for (const auto& stat : comp_stats) {
+ int sum = 0;
+ for (int i = 0; i < num_of_reasons; i++) {
+ counts[i] += stat.counts[i];
+ sum += stat.counts[i];
+ }
+ ASSERT_EQ(sum, stat.count);
+ }
+ // Verify InternalStats bookkeeping matches that of CompactionStatsCollector,
+ // assuming that all compactions complete.
+ for (int i = 0; i < num_of_reasons; i++) {
+ ASSERT_EQ(collector.NumberOfCompactions(static_cast<CompactionReason>(i)), counts[i]);
+ }
+#endif /* NDEBUG */
+}
+
+const SstFileMetaData* PickFileRandomly(
+ const ColumnFamilyMetaData& cf_meta,
+ Random* rand,
+ int* level = nullptr) {
+ auto file_id = rand->Uniform(static_cast<int>(
+ cf_meta.file_count)) + 1;
+ for (auto& level_meta : cf_meta.levels) {
+ if (file_id <= level_meta.files.size()) {
+ if (level != nullptr) {
+ *level = level_meta.level;
+ }
+ auto result = rand->Uniform(file_id);
+ return &(level_meta.files[result]);
+ }
+ file_id -= static_cast<uint32_t>(level_meta.files.size());
+ }
+ assert(false);
+ return nullptr;
+}
+} // anonymous namespace
+
+#ifndef ROCKSDB_VALGRIND_RUN
+// All the TEST_P tests run once with sub_compactions disabled (i.e.
+// options.max_subcompactions = 1) and once with it enabled
+TEST_P(DBCompactionTestWithParam, CompactionDeletionTrigger) {
+ for (int tid = 0; tid < 3; ++tid) {
+ uint64_t db_size[2];
+ Options options = DeletionTriggerOptions(CurrentOptions());
+ options.max_subcompactions = max_subcompactions_;
+
+ if (tid == 1) {
+ // the following only disable stats update in DB::Open()
+ // and should not affect the result of this test.
+ options.skip_stats_update_on_db_open = true;
+ } else if (tid == 2) {
+ // third pass with universal compaction
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 1;
+ }
+
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ const int kTestSize = kCDTKeysPerBuffer * 1024;
+ std::vector<std::string> values;
+ for (int k = 0; k < kTestSize; ++k) {
+ values.push_back(RandomString(&rnd, kCDTValueSize));
+ ASSERT_OK(Put(Key(k), values[k]));
+ }
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+ db_size[0] = Size(Key(0), Key(kTestSize - 1));
+
+ for (int k = 0; k < kTestSize; ++k) {
+ ASSERT_OK(Delete(Key(k)));
+ }
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+ db_size[1] = Size(Key(0), Key(kTestSize - 1));
+
+ // must have much smaller db size.
+ ASSERT_GT(db_size[0] / 3, db_size[1]);
+ }
+}
+#endif // ROCKSDB_VALGRIND_RUN
+
+TEST_P(DBCompactionTestWithParam, CompactionsPreserveDeletes) {
+ // For each options type we test following
+ // - Enable preserve_deletes
+ // - write bunch of keys and deletes
+ // - Set start_seqnum to the beginning; compact; check that keys are present
+ // - rewind start_seqnum way forward; compact; check that keys are gone
+
+ for (int tid = 0; tid < 3; ++tid) {
+ Options options = DeletionTriggerOptions(CurrentOptions());
+ options.max_subcompactions = max_subcompactions_;
+ options.preserve_deletes=true;
+ options.num_levels = 2;
+
+ if (tid == 1) {
+ options.skip_stats_update_on_db_open = true;
+ } else if (tid == 2) {
+ // third pass with universal compaction
+ options.compaction_style = kCompactionStyleUniversal;
+ }
+
+ DestroyAndReopen(options);
+ Random rnd(301);
+ // highlight the default; all deletes should be preserved
+ SetPreserveDeletesSequenceNumber(0);
+
+ const int kTestSize = kCDTKeysPerBuffer;
+ std::vector<std::string> values;
+ for (int k = 0; k < kTestSize; ++k) {
+ values.push_back(RandomString(&rnd, kCDTValueSize));
+ ASSERT_OK(Put(Key(k), values[k]));
+ }
+
+ for (int k = 0; k < kTestSize; ++k) {
+ ASSERT_OK(Delete(Key(k)));
+ }
+ // to ensure we tackle all tombstones
+ CompactRangeOptions cro;
+ cro.change_level = true;
+ cro.target_level = 2;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->CompactRange(cro, nullptr, nullptr);
+
+ // check that normal user iterator doesn't see anything
+ Iterator* db_iter = dbfull()->NewIterator(ReadOptions());
+ int i = 0;
+ for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) {
+ i++;
+ }
+ ASSERT_EQ(i, 0);
+ delete db_iter;
+
+ // check that iterator that sees internal keys sees tombstones
+ ReadOptions ro;
+ ro.iter_start_seqnum=1;
+ db_iter = dbfull()->NewIterator(ro);
+ i = 0;
+ for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) {
+ i++;
+ }
+ ASSERT_EQ(i, 4);
+ delete db_iter;
+
+ // now all deletes should be gone
+ SetPreserveDeletesSequenceNumber(100000000);
+ dbfull()->CompactRange(cro, nullptr, nullptr);
+
+ db_iter = dbfull()->NewIterator(ro);
+ i = 0;
+ for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) {
+ i++;
+ }
+ ASSERT_EQ(i, 0);
+ delete db_iter;
+ }
+}
+
+TEST_F(DBCompactionTest, SkipStatsUpdateTest) {
+ // This test verify UpdateAccumulatedStats is not on
+ // if options.skip_stats_update_on_db_open = true
+ // The test will need to be updated if the internal behavior changes.
+
+ Options options = DeletionTriggerOptions(CurrentOptions());
+ options.env = env_;
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ const int kTestSize = kCDTKeysPerBuffer * 512;
+ std::vector<std::string> values;
+ for (int k = 0; k < kTestSize; ++k) {
+ values.push_back(RandomString(&rnd, kCDTValueSize));
+ ASSERT_OK(Put(Key(k), values[k]));
+ }
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+
+ // Reopen the DB with stats-update disabled
+ options.skip_stats_update_on_db_open = true;
+ options.max_open_files = 20;
+ env_->random_file_open_counter_.store(0);
+ Reopen(options);
+
+ // As stats-update is disabled, we expect a very low number of
+ // random file open.
+ // Note that this number must be changed accordingly if we change
+ // the number of files needed to be opened in the DB::Open process.
+ const int kMaxFileOpenCount = 10;
+ ASSERT_LT(env_->random_file_open_counter_.load(), kMaxFileOpenCount);
+
+ // Repeat the reopen process, but this time we enable
+ // stats-update.
+ options.skip_stats_update_on_db_open = false;
+ env_->random_file_open_counter_.store(0);
+ Reopen(options);
+
+ // Since we do a normal stats update on db-open, there
+ // will be more random open files.
+ ASSERT_GT(env_->random_file_open_counter_.load(), kMaxFileOpenCount);
+}
+
+TEST_F(DBCompactionTest, TestTableReaderForCompaction) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.new_table_reader_for_compaction_inputs = true;
+ options.max_open_files = 20;
+ options.level0_file_num_compaction_trigger = 3;
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ int num_table_cache_lookup = 0;
+ int num_new_table_reader = 0;
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "TableCache::FindTable:0", [&](void* arg) {
+ assert(arg != nullptr);
+ bool no_io = *(reinterpret_cast<bool*>(arg));
+ if (!no_io) {
+ // filter out cases for table properties queries.
+ num_table_cache_lookup++;
+ }
+ });
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "TableCache::GetTableReader:0",
+ [&](void* /*arg*/) { num_new_table_reader++; });
+ rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+ for (int k = 0; k < options.level0_file_num_compaction_trigger; ++k) {
+ ASSERT_OK(Put(Key(k), Key(k)));
+ ASSERT_OK(Put(Key(10 - k), "bar"));
+ if (k < options.level0_file_num_compaction_trigger - 1) {
+ num_table_cache_lookup = 0;
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ // preloading iterator issues one table cache lookup and create
+ // a new table reader, if not preloaded.
+ int old_num_table_cache_lookup = num_table_cache_lookup;
+ ASSERT_GE(num_table_cache_lookup, 1);
+ ASSERT_EQ(num_new_table_reader, 1);
+
+ num_table_cache_lookup = 0;
+ num_new_table_reader = 0;
+ ASSERT_EQ(Key(k), Get(Key(k)));
+ // lookup iterator from table cache and no need to create a new one.
+ ASSERT_EQ(old_num_table_cache_lookup + num_table_cache_lookup, 2);
+ ASSERT_EQ(num_new_table_reader, 0);
+ }
+ }
+
+ num_table_cache_lookup = 0;
+ num_new_table_reader = 0;
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ // Preloading iterator issues one table cache lookup and creates
+ // a new table reader. One file is created for flush and one for compaction.
+ // Compaction inputs make no table cache look-up for data/range deletion
+ // iterators
+ // May preload table cache too.
+ ASSERT_GE(num_table_cache_lookup, 2);
+ int old_num_table_cache_lookup2 = num_table_cache_lookup;
+
+ // Create new iterator for:
+ // (1) 1 for verifying flush results
+ // (2) 3 for compaction input files
+ // (3) 1 for verifying compaction results.
+ ASSERT_EQ(num_new_table_reader, 5);
+
+ num_table_cache_lookup = 0;
+ num_new_table_reader = 0;
+ ASSERT_EQ(Key(1), Get(Key(1)));
+ ASSERT_EQ(num_table_cache_lookup + old_num_table_cache_lookup2, 3);
+ ASSERT_EQ(num_new_table_reader, 0);
+
+ num_table_cache_lookup = 0;
+ num_new_table_reader = 0;
+ CompactRangeOptions cro;
+ cro.change_level = true;
+ cro.target_level = 2;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ db_->CompactRange(cro, nullptr, nullptr);
+ // Only verifying compaction outputs issues one table cache lookup
+ // for both data block and range deletion block).
+ // May preload table cache too.
+ ASSERT_GE(num_table_cache_lookup, 1);
+ old_num_table_cache_lookup2 = num_table_cache_lookup;
+ // One for compaction input, one for verifying compaction results.
+ ASSERT_EQ(num_new_table_reader, 2);
+
+ num_table_cache_lookup = 0;
+ num_new_table_reader = 0;
+ ASSERT_EQ(Key(1), Get(Key(1)));
+ ASSERT_EQ(num_table_cache_lookup + old_num_table_cache_lookup2, 2);
+ ASSERT_EQ(num_new_table_reader, 0);
+
+ rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(DBCompactionTestWithParam, CompactionDeletionTriggerReopen) {
+ for (int tid = 0; tid < 2; ++tid) {
+ uint64_t db_size[3];
+ Options options = DeletionTriggerOptions(CurrentOptions());
+ options.max_subcompactions = max_subcompactions_;
+
+ if (tid == 1) {
+ // second pass with universal compaction
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 1;
+ }
+
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ // round 1 --- insert key/value pairs.
+ const int kTestSize = kCDTKeysPerBuffer * 512;
+ std::vector<std::string> values;
+ for (int k = 0; k < kTestSize; ++k) {
+ values.push_back(RandomString(&rnd, kCDTValueSize));
+ ASSERT_OK(Put(Key(k), values[k]));
+ }
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+ db_size[0] = Size(Key(0), Key(kTestSize - 1));
+ Close();
+
+ // round 2 --- disable auto-compactions and issue deletions.
+ options.create_if_missing = false;
+ options.disable_auto_compactions = true;
+ Reopen(options);
+
+ for (int k = 0; k < kTestSize; ++k) {
+ ASSERT_OK(Delete(Key(k)));
+ }
+ db_size[1] = Size(Key(0), Key(kTestSize - 1));
+ Close();
+ // as auto_compaction is off, we shouldn't see too much reduce
+ // in db size.
+ ASSERT_LT(db_size[0] / 3, db_size[1]);
+
+ // round 3 --- reopen db with auto_compaction on and see if
+ // deletion compensation still work.
+ options.disable_auto_compactions = false;
+ Reopen(options);
+ // insert relatively small amount of data to trigger auto compaction.
+ for (int k = 0; k < kTestSize / 10; ++k) {
+ ASSERT_OK(Put(Key(k), values[k]));
+ }
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+ db_size[2] = Size(Key(0), Key(kTestSize - 1));
+ // this time we're expecting significant drop in size.
+ ASSERT_GT(db_size[0] / 3, db_size[2]);
+ }
+}
+
+TEST_F(DBCompactionTest, DisableStatsUpdateReopen) {
+ uint64_t db_size[3];
+ for (int test = 0; test < 2; ++test) {
+ Options options = DeletionTriggerOptions(CurrentOptions());
+ options.skip_stats_update_on_db_open = (test == 0);
+
+ env_->random_read_counter_.Reset();
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ // round 1 --- insert key/value pairs.
+ const int kTestSize = kCDTKeysPerBuffer * 512;
+ std::vector<std::string> values;
+ for (int k = 0; k < kTestSize; ++k) {
+ values.push_back(RandomString(&rnd, kCDTValueSize));
+ ASSERT_OK(Put(Key(k), values[k]));
+ }
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+ db_size[0] = Size(Key(0), Key(kTestSize - 1));
+ Close();
+
+ // round 2 --- disable auto-compactions and issue deletions.
+ options.create_if_missing = false;
+ options.disable_auto_compactions = true;
+
+ env_->random_read_counter_.Reset();
+ Reopen(options);
+
+ for (int k = 0; k < kTestSize; ++k) {
+ ASSERT_OK(Delete(Key(k)));
+ }
+ db_size[1] = Size(Key(0), Key(kTestSize - 1));
+ Close();
+ // as auto_compaction is off, we shouldn't see too much reduce
+ // in db size.
+ ASSERT_LT(db_size[0] / 3, db_size[1]);
+
+ // round 3 --- reopen db with auto_compaction on and see if
+ // deletion compensation still work.
+ options.disable_auto_compactions = false;
+ Reopen(options);
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+ db_size[2] = Size(Key(0), Key(kTestSize - 1));
+
+ if (options.skip_stats_update_on_db_open) {
+ // If update stats on DB::Open is disable, we don't expect
+ // deletion entries taking effect.
+ ASSERT_LT(db_size[0] / 3, db_size[2]);
+ } else {
+ // Otherwise, we should see a significant drop in db size.
+ ASSERT_GT(db_size[0] / 3, db_size[2]);
+ }
+ }
+}
+
+
+TEST_P(DBCompactionTestWithParam, CompactionTrigger) {
+ const int kNumKeysPerFile = 100;
+
+ Options options = CurrentOptions();
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.arena_block_size = 4 << 10;
+ options.num_levels = 3;
+ options.level0_file_num_compaction_trigger = 3;
+ options.max_subcompactions = max_subcompactions_;
+ options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile));
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ Random rnd(301);
+
+ for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+ num++) {
+ std::vector<std::string> values;
+ // Write 100KB (100 values, each 1K)
+ for (int i = 0; i < kNumKeysPerFile; i++) {
+ values.push_back(RandomString(&rnd, 990));
+ ASSERT_OK(Put(1, Key(i), values[i]));
+ }
+ // put extra key to trigger flush
+ ASSERT_OK(Put(1, "", ""));
+ dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), num + 1);
+ }
+
+ // generate one more file in level-0, and should trigger level-0 compaction
+ std::vector<std::string> values;
+ for (int i = 0; i < kNumKeysPerFile; i++) {
+ values.push_back(RandomString(&rnd, 990));
+ ASSERT_OK(Put(1, Key(i), values[i]));
+ }
+ // put extra key to trigger flush
+ ASSERT_OK(Put(1, "", ""));
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(1, 1), 1);
+}
+
+TEST_F(DBCompactionTest, BGCompactionsAllowed) {
+ // Create several column families. Make compaction triggers in all of them
+ // and see number of compactions scheduled to be less than allowed.
+ const int kNumKeysPerFile = 100;
+
+ Options options = CurrentOptions();
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.arena_block_size = 4 << 10;
+ options.num_levels = 3;
+ // Should speed up compaction when there are 4 files.
+ options.level0_file_num_compaction_trigger = 2;
+ options.level0_slowdown_writes_trigger = 20;
+ options.soft_pending_compaction_bytes_limit = 1 << 30; // Infinitely large
+ options.max_background_compactions = 3;
+ options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile));
+
+ // Block all threads in thread pool.
+ const size_t kTotalTasks = 4;
+ env_->SetBackgroundThreads(4, Env::LOW);
+ test::SleepingBackgroundTask sleeping_tasks[kTotalTasks];
+ for (size_t i = 0; i < kTotalTasks; i++) {
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_tasks[i], Env::Priority::LOW);
+ sleeping_tasks[i].WaitUntilSleeping();
+ }
+
+ CreateAndReopenWithCF({"one", "two", "three"}, options);
+
+ Random rnd(301);
+ for (int cf = 0; cf < 4; cf++) {
+ for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) {
+ for (int i = 0; i < kNumKeysPerFile; i++) {
+ ASSERT_OK(Put(cf, Key(i), ""));
+ }
+ // put extra key to trigger flush
+ ASSERT_OK(Put(cf, "", ""));
+ dbfull()->TEST_WaitForFlushMemTable(handles_[cf]);
+ ASSERT_EQ(NumTableFilesAtLevel(0, cf), num + 1);
+ }
+ }
+
+ // Now all column families qualify compaction but only one should be
+ // scheduled, because no column family hits speed up condition.
+ ASSERT_EQ(1, env_->GetThreadPoolQueueLen(Env::Priority::LOW));
+
+ // Create two more files for one column family, which triggers speed up
+ // condition, three compactions will be scheduled.
+ for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) {
+ for (int i = 0; i < kNumKeysPerFile; i++) {
+ ASSERT_OK(Put(2, Key(i), ""));
+ }
+ // put extra key to trigger flush
+ ASSERT_OK(Put(2, "", ""));
+ dbfull()->TEST_WaitForFlushMemTable(handles_[2]);
+ ASSERT_EQ(options.level0_file_num_compaction_trigger + num + 1,
+ NumTableFilesAtLevel(0, 2));
+ }
+ ASSERT_EQ(3, env_->GetThreadPoolQueueLen(Env::Priority::LOW));
+
+ // Unblock all threads to unblock all compactions.
+ for (size_t i = 0; i < kTotalTasks; i++) {
+ sleeping_tasks[i].WakeUp();
+ sleeping_tasks[i].WaitUntilDone();
+ }
+ dbfull()->TEST_WaitForCompact();
+
+ // Verify number of compactions allowed will come back to 1.
+
+ for (size_t i = 0; i < kTotalTasks; i++) {
+ sleeping_tasks[i].Reset();
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_tasks[i], Env::Priority::LOW);
+ sleeping_tasks[i].WaitUntilSleeping();
+ }
+ for (int cf = 0; cf < 4; cf++) {
+ for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) {
+ for (int i = 0; i < kNumKeysPerFile; i++) {
+ ASSERT_OK(Put(cf, Key(i), ""));
+ }
+ // put extra key to trigger flush
+ ASSERT_OK(Put(cf, "", ""));
+ dbfull()->TEST_WaitForFlushMemTable(handles_[cf]);
+ ASSERT_EQ(NumTableFilesAtLevel(0, cf), num + 1);
+ }
+ }
+
+ // Now all column families qualify compaction but only one should be
+ // scheduled, because no column family hits speed up condition.
+ ASSERT_EQ(1, env_->GetThreadPoolQueueLen(Env::Priority::LOW));
+
+ for (size_t i = 0; i < kTotalTasks; i++) {
+ sleeping_tasks[i].WakeUp();
+ sleeping_tasks[i].WaitUntilDone();
+ }
+}
+
+TEST_P(DBCompactionTestWithParam, CompactionsGenerateMultipleFiles) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100000000; // Large write buffer
+ options.max_subcompactions = max_subcompactions_;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ Random rnd(301);
+
+ // Write 8MB (80 values, each 100K)
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ std::vector<std::string> values;
+ for (int i = 0; i < 80; i++) {
+ values.push_back(RandomString(&rnd, 100000));
+ ASSERT_OK(Put(1, Key(i), values[i]));
+ }
+
+ // Reopening moves updates to level-0
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1],
+ true /* disallow trivial move */);
+
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ ASSERT_GT(NumTableFilesAtLevel(1, 1), 1);
+ for (int i = 0; i < 80; i++) {
+ ASSERT_EQ(Get(1, Key(i)), values[i]);
+ }
+}
+
+TEST_F(DBCompactionTest, MinorCompactionsHappen) {
+ do {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 10000;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ const int N = 500;
+
+ int starting_num_tables = TotalTableFiles(1);
+ for (int i = 0; i < N; i++) {
+ ASSERT_OK(Put(1, Key(i), Key(i) + std::string(1000, 'v')));
+ }
+ int ending_num_tables = TotalTableFiles(1);
+ ASSERT_GT(ending_num_tables, starting_num_tables);
+
+ for (int i = 0; i < N; i++) {
+ ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(1, Key(i)));
+ }
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ for (int i = 0; i < N; i++) {
+ ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(1, Key(i)));
+ }
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBCompactionTest, UserKeyCrossFile1) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleLevel;
+ options.level0_file_num_compaction_trigger = 3;
+
+ DestroyAndReopen(options);
+
+ // create first file and flush to l0
+ Put("4", "A");
+ Put("3", "A");
+ Flush();
+ dbfull()->TEST_WaitForFlushMemTable();
+
+ Put("2", "A");
+ Delete("3");
+ Flush();
+ dbfull()->TEST_WaitForFlushMemTable();
+ ASSERT_EQ("NOT_FOUND", Get("3"));
+
+ // move both files down to l1
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ASSERT_EQ("NOT_FOUND", Get("3"));
+
+ for (int i = 0; i < 3; i++) {
+ Put("2", "B");
+ Flush();
+ dbfull()->TEST_WaitForFlushMemTable();
+ }
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_EQ("NOT_FOUND", Get("3"));
+}
+
+TEST_F(DBCompactionTest, UserKeyCrossFile2) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleLevel;
+ options.level0_file_num_compaction_trigger = 3;
+
+ DestroyAndReopen(options);
+
+ // create first file and flush to l0
+ Put("4", "A");
+ Put("3", "A");
+ Flush();
+ dbfull()->TEST_WaitForFlushMemTable();
+
+ Put("2", "A");
+ SingleDelete("3");
+ Flush();
+ dbfull()->TEST_WaitForFlushMemTable();
+ ASSERT_EQ("NOT_FOUND", Get("3"));
+
+ // move both files down to l1
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ASSERT_EQ("NOT_FOUND", Get("3"));
+
+ for (int i = 0; i < 3; i++) {
+ Put("2", "B");
+ Flush();
+ dbfull()->TEST_WaitForFlushMemTable();
+ }
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_EQ("NOT_FOUND", Get("3"));
+}
+
+TEST_F(DBCompactionTest, ZeroSeqIdCompaction) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleLevel;
+ options.level0_file_num_compaction_trigger = 3;
+
+ FlushedFileCollector* collector = new FlushedFileCollector();
+ options.listeners.emplace_back(collector);
+
+ // compaction options
+ CompactionOptions compact_opt;
+ compact_opt.compression = kNoCompression;
+ compact_opt.output_file_size_limit = 4096;
+ const size_t key_len =
+ static_cast<size_t>(compact_opt.output_file_size_limit) / 5;
+
+ DestroyAndReopen(options);
+
+ std::vector<const Snapshot*> snaps;
+
+ // create first file and flush to l0
+ for (auto& key : {"1", "2", "3", "3", "3", "3"}) {
+ Put(key, std::string(key_len, 'A'));
+ snaps.push_back(dbfull()->GetSnapshot());
+ }
+ Flush();
+ dbfull()->TEST_WaitForFlushMemTable();
+
+ // create second file and flush to l0
+ for (auto& key : {"3", "4", "5", "6", "7", "8"}) {
+ Put(key, std::string(key_len, 'A'));
+ snaps.push_back(dbfull()->GetSnapshot());
+ }
+ Flush();
+ dbfull()->TEST_WaitForFlushMemTable();
+
+ // move both files down to l1
+ dbfull()->CompactFiles(compact_opt, collector->GetFlushedFiles(), 1);
+
+ // release snap so that first instance of key(3) can have seqId=0
+ for (auto snap : snaps) {
+ dbfull()->ReleaseSnapshot(snap);
+ }
+
+ // create 3 files in l0 so to trigger compaction
+ for (int i = 0; i < options.level0_file_num_compaction_trigger; i++) {
+ Put("2", std::string(1, 'A'));
+ Flush();
+ dbfull()->TEST_WaitForFlushMemTable();
+ }
+
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_OK(Put("", ""));
+}
+
+TEST_F(DBCompactionTest, ManualCompactionUnknownOutputSize) {
+ // github issue #2249
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleLevel;
+ options.level0_file_num_compaction_trigger = 3;
+ DestroyAndReopen(options);
+
+ // create two files in l1 that we can compact
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < options.level0_file_num_compaction_trigger; j++) {
+ // make l0 files' ranges overlap to avoid trivial move
+ Put(std::to_string(2 * i), std::string(1, 'A'));
+ Put(std::to_string(2 * i + 1), std::string(1, 'A'));
+ Flush();
+ dbfull()->TEST_WaitForFlushMemTable();
+ }
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(1, 0), i + 1);
+ }
+
+ ColumnFamilyMetaData cf_meta;
+ dbfull()->GetColumnFamilyMetaData(dbfull()->DefaultColumnFamily(), &cf_meta);
+ ASSERT_EQ(2, cf_meta.levels[1].files.size());
+ std::vector<std::string> input_filenames;
+ for (const auto& sst_file : cf_meta.levels[1].files) {
+ input_filenames.push_back(sst_file.name);
+ }
+
+ // note CompactionOptions::output_file_size_limit is unset.
+ CompactionOptions compact_opt;
+ compact_opt.compression = kNoCompression;
+ dbfull()->CompactFiles(compact_opt, input_filenames, 1);
+}
+
+// Check that writes done during a memtable compaction are recovered
+// if the database is shutdown during the memtable compaction.
+TEST_F(DBCompactionTest, RecoverDuringMemtableCompaction) {
+ do {
+ Options options = CurrentOptions();
+ options.env = env_;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Trigger a long memtable compaction and reopen the database during it
+ ASSERT_OK(Put(1, "foo", "v1")); // Goes to 1st log file
+ ASSERT_OK(Put(1, "big1", std::string(10000000, 'x'))); // Fills memtable
+ ASSERT_OK(Put(1, "big2", std::string(1000, 'y'))); // Triggers compaction
+ ASSERT_OK(Put(1, "bar", "v2")); // Goes to new log file
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_EQ("v2", Get(1, "bar"));
+ ASSERT_EQ(std::string(10000000, 'x'), Get(1, "big1"));
+ ASSERT_EQ(std::string(1000, 'y'), Get(1, "big2"));
+ } while (ChangeOptions());
+}
+
+TEST_P(DBCompactionTestWithParam, TrivialMoveOneFile) {
+ int32_t trivial_move = 0;
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100000000;
+ options.max_subcompactions = max_subcompactions_;
+ DestroyAndReopen(options);
+
+ int32_t num_keys = 80;
+ int32_t value_size = 100 * 1024; // 100 KB
+
+ Random rnd(301);
+ std::vector<std::string> values;
+ for (int i = 0; i < num_keys; i++) {
+ values.push_back(RandomString(&rnd, value_size));
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+
+ // Reopening moves updates to L0
+ Reopen(options);
+ ASSERT_EQ(NumTableFilesAtLevel(0, 0), 1); // 1 file in L0
+ ASSERT_EQ(NumTableFilesAtLevel(1, 0), 0); // 0 files in L1
+
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+ ASSERT_EQ(metadata.size(), 1U);
+ LiveFileMetaData level0_file = metadata[0]; // L0 file meta
+
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = exclusive_manual_compaction_;
+
+ // Compaction will initiate a trivial move from L0 to L1
+ dbfull()->CompactRange(cro, nullptr, nullptr);
+
+ // File moved From L0 to L1
+ ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0); // 0 files in L0
+ ASSERT_EQ(NumTableFilesAtLevel(1, 0), 1); // 1 file in L1
+
+ metadata.clear();
+ db_->GetLiveFilesMetaData(&metadata);
+ ASSERT_EQ(metadata.size(), 1U);
+ ASSERT_EQ(metadata[0].name /* level1_file.name */, level0_file.name);
+ ASSERT_EQ(metadata[0].size /* level1_file.size */, level0_file.size);
+
+ for (int i = 0; i < num_keys; i++) {
+ ASSERT_EQ(Get(Key(i)), values[i]);
+ }
+
+ ASSERT_EQ(trivial_move, 1);
+ rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBCompactionTestWithParam, TrivialMoveNonOverlappingFiles) {
+ int32_t trivial_move = 0;
+ int32_t non_trivial_move = 0;
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial",
+ [&](void* /*arg*/) { non_trivial_move++; });
+ rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.write_buffer_size = 10 * 1024 * 1024;
+ options.max_subcompactions = max_subcompactions_;
+
+ DestroyAndReopen(options);
+ // non overlapping ranges
+ std::vector<std::pair<int32_t, int32_t>> ranges = {
+ {100, 199},
+ {300, 399},
+ {0, 99},
+ {200, 299},
+ {600, 699},
+ {400, 499},
+ {500, 550},
+ {551, 599},
+ };
+ int32_t value_size = 10 * 1024; // 10 KB
+
+ Random rnd(301);
+ std::map<int32_t, std::string> values;
+ for (size_t i = 0; i < ranges.size(); i++) {
+ for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) {
+ values[j] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(j), values[j]));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ int32_t level0_files = NumTableFilesAtLevel(0, 0);
+ ASSERT_EQ(level0_files, ranges.size()); // Multiple files in L0
+ ASSERT_EQ(NumTableFilesAtLevel(1, 0), 0); // No files in L1
+
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = exclusive_manual_compaction_;
+
+ // Since data is non-overlapping we expect compaction to initiate
+ // a trivial move
+ db_->CompactRange(cro, nullptr, nullptr);
+ // We expect that all the files were trivially moved from L0 to L1
+ ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(1, 0) /* level1_files */, level0_files);
+
+ for (size_t i = 0; i < ranges.size(); i++) {
+ for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) {
+ ASSERT_EQ(Get(Key(j)), values[j]);
+ }
+ }
+
+ ASSERT_EQ(trivial_move, 1);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ trivial_move = 0;
+ non_trivial_move = 0;
+ values.clear();
+ DestroyAndReopen(options);
+ // Same ranges as above but overlapping
+ ranges = {
+ {100, 199},
+ {300, 399},
+ {0, 99},
+ {200, 299},
+ {600, 699},
+ {400, 499},
+ {500, 560}, // this range overlap with the next one
+ {551, 599},
+ };
+ for (size_t i = 0; i < ranges.size(); i++) {
+ for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) {
+ values[j] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(j), values[j]));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ db_->CompactRange(cro, nullptr, nullptr);
+
+ for (size_t i = 0; i < ranges.size(); i++) {
+ for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) {
+ ASSERT_EQ(Get(Key(j)), values[j]);
+ }
+ }
+ ASSERT_EQ(trivial_move, 0);
+ ASSERT_EQ(non_trivial_move, 1);
+
+ rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBCompactionTestWithParam, TrivialMoveTargetLevel) {
+ int32_t trivial_move = 0;
+ int32_t non_trivial_move = 0;
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial",
+ [&](void* /*arg*/) { non_trivial_move++; });
+ rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.write_buffer_size = 10 * 1024 * 1024;
+ options.num_levels = 7;
+ options.max_subcompactions = max_subcompactions_;
+
+ DestroyAndReopen(options);
+ int32_t value_size = 10 * 1024; // 10 KB
+
+ // Add 2 non-overlapping files
+ Random rnd(301);
+ std::map<int32_t, std::string> values;
+
+ // file 1 [0 => 300]
+ for (int32_t i = 0; i <= 300; i++) {
+ values[i] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ // file 2 [600 => 700]
+ for (int32_t i = 600; i <= 700; i++) {
+ values[i] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ // 2 files in L0
+ ASSERT_EQ("2", FilesPerLevel(0));
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 6;
+ compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ // 2 files in L6
+ ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel(0));
+
+ ASSERT_EQ(trivial_move, 1);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ for (int32_t i = 0; i <= 300; i++) {
+ ASSERT_EQ(Get(Key(i)), values[i]);
+ }
+ for (int32_t i = 600; i <= 700; i++) {
+ ASSERT_EQ(Get(Key(i)), values[i]);
+ }
+}
+
+TEST_P(DBCompactionTestWithParam, ManualCompactionPartial) {
+ int32_t trivial_move = 0;
+ int32_t non_trivial_move = 0;
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial",
+ [&](void* /*arg*/) { non_trivial_move++; });
+ bool first = true;
+ // Purpose of dependencies:
+ // 4 -> 1: ensure the order of two non-trivial compactions
+ // 5 -> 2 and 5 -> 3: ensure we do a check before two non-trivial compactions
+ // are installed
+ rocksdb::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBCompaction::ManualPartial:4", "DBCompaction::ManualPartial:1"},
+ {"DBCompaction::ManualPartial:5", "DBCompaction::ManualPartial:2"},
+ {"DBCompaction::ManualPartial:5", "DBCompaction::ManualPartial:3"}});
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+ if (first) {
+ first = false;
+ TEST_SYNC_POINT("DBCompaction::ManualPartial:4");
+ TEST_SYNC_POINT("DBCompaction::ManualPartial:3");
+ } else { // second non-trivial compaction
+ TEST_SYNC_POINT("DBCompaction::ManualPartial:2");
+ }
+ });
+
+ rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.write_buffer_size = 10 * 1024 * 1024;
+ options.num_levels = 7;
+ options.max_subcompactions = max_subcompactions_;
+ options.level0_file_num_compaction_trigger = 3;
+ options.max_background_compactions = 3;
+ options.target_file_size_base = 1 << 23; // 8 MB
+
+ DestroyAndReopen(options);
+ int32_t value_size = 10 * 1024; // 10 KB
+
+ // Add 2 non-overlapping files
+ Random rnd(301);
+ std::map<int32_t, std::string> values;
+
+ // file 1 [0 => 100]
+ for (int32_t i = 0; i < 100; i++) {
+ values[i] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ // file 2 [100 => 300]
+ for (int32_t i = 100; i < 300; i++) {
+ values[i] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ // 2 files in L0
+ ASSERT_EQ("2", FilesPerLevel(0));
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 6;
+ compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+ // Trivial move the two non-overlapping files to level 6
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ // 2 files in L6
+ ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel(0));
+
+ ASSERT_EQ(trivial_move, 1);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ // file 3 [ 0 => 200]
+ for (int32_t i = 0; i < 200; i++) {
+ values[i] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ // 1 files in L0
+ ASSERT_EQ("1,0,0,0,0,0,2", FilesPerLevel(0));
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, false));
+ ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, nullptr, false));
+ ASSERT_OK(dbfull()->TEST_CompactRange(2, nullptr, nullptr, nullptr, false));
+ ASSERT_OK(dbfull()->TEST_CompactRange(3, nullptr, nullptr, nullptr, false));
+ ASSERT_OK(dbfull()->TEST_CompactRange(4, nullptr, nullptr, nullptr, false));
+ // 2 files in L6, 1 file in L5
+ ASSERT_EQ("0,0,0,0,0,1,2", FilesPerLevel(0));
+
+ ASSERT_EQ(trivial_move, 6);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ rocksdb::port::Thread threads([&] {
+ compact_options.change_level = false;
+ compact_options.exclusive_manual_compaction = false;
+ std::string begin_string = Key(0);
+ std::string end_string = Key(199);
+ Slice begin(begin_string);
+ Slice end(end_string);
+ // First non-trivial compaction is triggered
+ ASSERT_OK(db_->CompactRange(compact_options, &begin, &end));
+ });
+
+ TEST_SYNC_POINT("DBCompaction::ManualPartial:1");
+ // file 4 [300 => 400)
+ for (int32_t i = 300; i <= 400; i++) {
+ values[i] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ // file 5 [400 => 500)
+ for (int32_t i = 400; i <= 500; i++) {
+ values[i] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ // file 6 [500 => 600)
+ for (int32_t i = 500; i <= 600; i++) {
+ values[i] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ // Second non-trivial compaction is triggered
+ ASSERT_OK(Flush());
+
+ // Before two non-trivial compactions are installed, there are 3 files in L0
+ ASSERT_EQ("3,0,0,0,0,1,2", FilesPerLevel(0));
+ TEST_SYNC_POINT("DBCompaction::ManualPartial:5");
+
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+ // After two non-trivial compactions are installed, there is 1 file in L6, and
+ // 1 file in L1
+ ASSERT_EQ("0,1,0,0,0,0,1", FilesPerLevel(0));
+ threads.join();
+
+ for (int32_t i = 0; i < 600; i++) {
+ ASSERT_EQ(Get(Key(i)), values[i]);
+ }
+}
+
+// Disable as the test is flaky.
+TEST_F(DBCompactionTest, DISABLED_ManualPartialFill) {
+ int32_t trivial_move = 0;
+ int32_t non_trivial_move = 0;
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial",
+ [&](void* /*arg*/) { non_trivial_move++; });
+ bool first = true;
+ bool second = true;
+ rocksdb::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBCompaction::PartialFill:4", "DBCompaction::PartialFill:1"},
+ {"DBCompaction::PartialFill:2", "DBCompaction::PartialFill:3"}});
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+ if (first) {
+ TEST_SYNC_POINT("DBCompaction::PartialFill:4");
+ first = false;
+ TEST_SYNC_POINT("DBCompaction::PartialFill:3");
+ } else if (second) {
+ }
+ });
+
+ rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.write_buffer_size = 10 * 1024 * 1024;
+ options.max_bytes_for_level_multiplier = 2;
+ options.num_levels = 4;
+ options.level0_file_num_compaction_trigger = 3;
+ options.max_background_compactions = 3;
+
+ DestroyAndReopen(options);
+ // make sure all background compaction jobs can be scheduled
+ auto stop_token =
+ dbfull()->TEST_write_controler().GetCompactionPressureToken();
+ int32_t value_size = 10 * 1024; // 10 KB
+
+ // Add 2 non-overlapping files
+ Random rnd(301);
+ std::map<int32_t, std::string> values;
+
+ // file 1 [0 => 100]
+ for (int32_t i = 0; i < 100; i++) {
+ values[i] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ // file 2 [100 => 300]
+ for (int32_t i = 100; i < 300; i++) {
+ values[i] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ // 2 files in L0
+ ASSERT_EQ("2", FilesPerLevel(0));
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 2;
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ // 2 files in L2
+ ASSERT_EQ("0,0,2", FilesPerLevel(0));
+
+ ASSERT_EQ(trivial_move, 1);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ // file 3 [ 0 => 200]
+ for (int32_t i = 0; i < 200; i++) {
+ values[i] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ // 2 files in L2, 1 in L0
+ ASSERT_EQ("1,0,2", FilesPerLevel(0));
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, false));
+ // 2 files in L2, 1 in L1
+ ASSERT_EQ("0,1,2", FilesPerLevel(0));
+
+ ASSERT_EQ(trivial_move, 2);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ rocksdb::port::Thread threads([&] {
+ compact_options.change_level = false;
+ compact_options.exclusive_manual_compaction = false;
+ std::string begin_string = Key(0);
+ std::string end_string = Key(199);
+ Slice begin(begin_string);
+ Slice end(end_string);
+ ASSERT_OK(db_->CompactRange(compact_options, &begin, &end));
+ });
+
+ TEST_SYNC_POINT("DBCompaction::PartialFill:1");
+ // Many files 4 [300 => 4300)
+ for (int32_t i = 0; i <= 5; i++) {
+ for (int32_t j = 300; j < 4300; j++) {
+ if (j == 2300) {
+ ASSERT_OK(Flush());
+ dbfull()->TEST_WaitForFlushMemTable();
+ }
+ values[j] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(j), values[j]));
+ }
+ }
+
+ // Verify level sizes
+ uint64_t target_size = 4 * options.max_bytes_for_level_base;
+ for (int32_t i = 1; i < options.num_levels; i++) {
+ ASSERT_LE(SizeAtLevel(i), target_size);
+ target_size = static_cast<uint64_t>(target_size *
+ options.max_bytes_for_level_multiplier);
+ }
+
+ TEST_SYNC_POINT("DBCompaction::PartialFill:2");
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+ threads.join();
+
+ for (int32_t i = 0; i < 4300; i++) {
+ ASSERT_EQ(Get(Key(i)), values[i]);
+ }
+}
+
+TEST_F(DBCompactionTest, DeleteFileRange) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 10 * 1024 * 1024;
+ options.max_bytes_for_level_multiplier = 2;
+ options.num_levels = 4;
+ options.level0_file_num_compaction_trigger = 3;
+ options.max_background_compactions = 3;
+
+ DestroyAndReopen(options);
+ int32_t value_size = 10 * 1024; // 10 KB
+
+ // Add 2 non-overlapping files
+ Random rnd(301);
+ std::map<int32_t, std::string> values;
+
+ // file 1 [0 => 100]
+ for (int32_t i = 0; i < 100; i++) {
+ values[i] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ // file 2 [100 => 300]
+ for (int32_t i = 100; i < 300; i++) {
+ values[i] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ // 2 files in L0
+ ASSERT_EQ("2", FilesPerLevel(0));
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 2;
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ // 2 files in L2
+ ASSERT_EQ("0,0,2", FilesPerLevel(0));
+
+ // file 3 [ 0 => 200]
+ for (int32_t i = 0; i < 200; i++) {
+ values[i] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ // Many files 4 [300 => 4300)
+ for (int32_t i = 0; i <= 5; i++) {
+ for (int32_t j = 300; j < 4300; j++) {
+ if (j == 2300) {
+ ASSERT_OK(Flush());
+ dbfull()->TEST_WaitForFlushMemTable();
+ }
+ values[j] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(j), values[j]));
+ }
+ }
+ ASSERT_OK(Flush());
+ dbfull()->TEST_WaitForFlushMemTable();
+ dbfull()->TEST_WaitForCompact();
+
+ // Verify level sizes
+ uint64_t target_size = 4 * options.max_bytes_for_level_base;
+ for (int32_t i = 1; i < options.num_levels; i++) {
+ ASSERT_LE(SizeAtLevel(i), target_size);
+ target_size = static_cast<uint64_t>(target_size *
+ options.max_bytes_for_level_multiplier);
+ }
+
+ size_t old_num_files = CountFiles();
+ std::string begin_string = Key(1000);
+ std::string end_string = Key(2000);
+ Slice begin(begin_string);
+ Slice end(end_string);
+ ASSERT_OK(DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin, &end));
+
+ int32_t deleted_count = 0;
+ for (int32_t i = 0; i < 4300; i++) {
+ if (i < 1000 || i > 2000) {
+ ASSERT_EQ(Get(Key(i)), values[i]);
+ } else {
+ ReadOptions roptions;
+ std::string result;
+ Status s = db_->Get(roptions, Key(i), &result);
+ ASSERT_TRUE(s.IsNotFound() || s.ok());
+ if (s.IsNotFound()) {
+ deleted_count++;
+ }
+ }
+ }
+ ASSERT_GT(deleted_count, 0);
+ begin_string = Key(5000);
+ end_string = Key(6000);
+ Slice begin1(begin_string);
+ Slice end1(end_string);
+ // Try deleting files in range which contain no keys
+ ASSERT_OK(
+ DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin1, &end1));
+
+ // Push data from level 0 to level 1 to force all data to be deleted
+ // Note that we don't delete level 0 files
+ compact_options.change_level = true;
+ compact_options.target_level = 1;
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_OK(
+ DeleteFilesInRange(db_, db_->DefaultColumnFamily(), nullptr, nullptr));
+
+ int32_t deleted_count2 = 0;
+ for (int32_t i = 0; i < 4300; i++) {
+ ReadOptions roptions;
+ std::string result;
+ Status s = db_->Get(roptions, Key(i), &result);
+ ASSERT_TRUE(s.IsNotFound());
+ deleted_count2++;
+ }
+ ASSERT_GT(deleted_count2, deleted_count);
+ size_t new_num_files = CountFiles();
+ ASSERT_GT(old_num_files, new_num_files);
+}
+
+TEST_F(DBCompactionTest, DeleteFilesInRanges) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 10 * 1024 * 1024;
+ options.max_bytes_for_level_multiplier = 2;
+ options.num_levels = 4;
+ options.max_background_compactions = 3;
+ options.disable_auto_compactions = true;
+
+ DestroyAndReopen(options);
+ int32_t value_size = 10 * 1024; // 10 KB
+
+ Random rnd(301);
+ std::map<int32_t, std::string> values;
+
+ // file [0 => 100), [100 => 200), ... [900, 1000)
+ for (auto i = 0; i < 10; i++) {
+ for (auto j = 0; j < 100; j++) {
+ auto k = i * 100 + j;
+ values[k] = RandomString(&rnd, value_size);
+ ASSERT_OK(Put(Key(k), values[k]));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_EQ("10", FilesPerLevel(0));
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 2;
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ ASSERT_EQ("0,0,10", FilesPerLevel(0));
+
+ // file [0 => 100), [200 => 300), ... [800, 900)
+ for (auto i = 0; i < 10; i+=2) {
+ for (auto j = 0; j < 100; j++) {
+ auto k = i * 100 + j;
+ ASSERT_OK(Put(Key(k), values[k]));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_EQ("5,0,10", FilesPerLevel(0));
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr));
+ ASSERT_EQ("0,5,10", FilesPerLevel(0));
+
+ // Delete files in range [0, 299] (inclusive)
+ {
+ auto begin_str1 = Key(0), end_str1 = Key(100);
+ auto begin_str2 = Key(100), end_str2 = Key(200);
+ auto begin_str3 = Key(200), end_str3 = Key(299);
+ Slice begin1(begin_str1), end1(end_str1);
+ Slice begin2(begin_str2), end2(end_str2);
+ Slice begin3(begin_str3), end3(end_str3);
+ std::vector<RangePtr> ranges;
+ ranges.push_back(RangePtr(&begin1, &end1));
+ ranges.push_back(RangePtr(&begin2, &end2));
+ ranges.push_back(RangePtr(&begin3, &end3));
+ ASSERT_OK(DeleteFilesInRanges(db_, db_->DefaultColumnFamily(),
+ ranges.data(), ranges.size()));
+ ASSERT_EQ("0,3,7", FilesPerLevel(0));
+
+ // Keys [0, 300) should not exist.
+ for (auto i = 0; i < 300; i++) {
+ ReadOptions ropts;
+ std::string result;
+ auto s = db_->Get(ropts, Key(i), &result);
+ ASSERT_TRUE(s.IsNotFound());
+ }
+ for (auto i = 300; i < 1000; i++) {
+ ASSERT_EQ(Get(Key(i)), values[i]);
+ }
+ }
+
+ // Delete files in range [600, 999) (exclusive)
+ {
+ auto begin_str1 = Key(600), end_str1 = Key(800);
+ auto begin_str2 = Key(700), end_str2 = Key(900);
+ auto begin_str3 = Key(800), end_str3 = Key(999);
+ Slice begin1(begin_str1), end1(end_str1);
+ Slice begin2(begin_str2), end2(end_str2);
+ Slice begin3(begin_str3), end3(end_str3);
+ std::vector<RangePtr> ranges;
+ ranges.push_back(RangePtr(&begin1, &end1));
+ ranges.push_back(RangePtr(&begin2, &end2));
+ ranges.push_back(RangePtr(&begin3, &end3));
+ ASSERT_OK(DeleteFilesInRanges(db_, db_->DefaultColumnFamily(),
+ ranges.data(), ranges.size(), false));
+ ASSERT_EQ("0,1,4", FilesPerLevel(0));
+
+ // Keys [600, 900) should not exist.
+ for (auto i = 600; i < 900; i++) {
+ ReadOptions ropts;
+ std::string result;
+ auto s = db_->Get(ropts, Key(i), &result);
+ ASSERT_TRUE(s.IsNotFound());
+ }
+ for (auto i = 300; i < 600; i++) {
+ ASSERT_EQ(Get(Key(i)), values[i]);
+ }
+ for (auto i = 900; i < 1000; i++) {
+ ASSERT_EQ(Get(Key(i)), values[i]);
+ }
+ }
+
+ // Delete all files.
+ {
+ RangePtr range;
+ ASSERT_OK(DeleteFilesInRanges(db_, db_->DefaultColumnFamily(), &range, 1));
+ ASSERT_EQ("", FilesPerLevel(0));
+
+ for (auto i = 0; i < 1000; i++) {
+ ReadOptions ropts;
+ std::string result;
+ auto s = db_->Get(ropts, Key(i), &result);
+ ASSERT_TRUE(s.IsNotFound());
+ }
+ }
+}
+
+TEST_F(DBCompactionTest, DeleteFileRangeFileEndpointsOverlapBug) {
+ // regression test for #2833: groups of files whose user-keys overlap at the
+ // endpoints could be split by `DeleteFilesInRange`. This caused old data to
+ // reappear, either because a new version of the key was removed, or a range
+ // deletion was partially dropped. It could also cause non-overlapping
+ // invariant to be violated if the files dropped by DeleteFilesInRange were
+ // a subset of files that a range deletion spans.
+ const int kNumL0Files = 2;
+ const int kValSize = 8 << 10; // 8KB
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ options.target_file_size_base = 1 << 10; // 1KB
+ DestroyAndReopen(options);
+
+ // The snapshot prevents key 1 from having its old version dropped. The low
+ // `target_file_size_base` ensures two keys will be in each output file.
+ const Snapshot* snapshot = nullptr;
+ Random rnd(301);
+ // The value indicates which flush the key belonged to, which is enough
+ // for us to determine the keys' relative ages. After L0 flushes finish,
+ // files look like:
+ //
+ // File 0: 0 -> vals[0], 1 -> vals[0]
+ // File 1: 1 -> vals[1], 2 -> vals[1]
+ //
+ // Then L0->L1 compaction happens, which outputs keys as follows:
+ //
+ // File 0: 0 -> vals[0], 1 -> vals[1]
+ // File 1: 1 -> vals[0], 2 -> vals[1]
+ //
+ // DeleteFilesInRange shouldn't be allowed to drop just file 0, as that
+ // would cause `1 -> vals[0]` (an older key) to reappear.
+ std::string vals[kNumL0Files];
+ for (int i = 0; i < kNumL0Files; ++i) {
+ vals[i] = RandomString(&rnd, kValSize);
+ Put(Key(i), vals[i]);
+ Put(Key(i + 1), vals[i]);
+ Flush();
+ if (i == 0) {
+ snapshot = db_->GetSnapshot();
+ }
+ }
+ dbfull()->TEST_WaitForCompact();
+
+ // Verify `DeleteFilesInRange` can't drop only file 0 which would cause
+ // "1 -> vals[0]" to reappear.
+ std::string begin_str = Key(0), end_str = Key(1);
+ Slice begin = begin_str, end = end_str;
+ ASSERT_OK(DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin, &end));
+ ASSERT_EQ(vals[1], Get(Key(1)));
+
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_P(DBCompactionTestWithParam, TrivialMoveToLastLevelWithFiles) {
+ int32_t trivial_move = 0;
+ int32_t non_trivial_move = 0;
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial",
+ [&](void* /*arg*/) { non_trivial_move++; });
+ rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100000000;
+ options.max_subcompactions = max_subcompactions_;
+ DestroyAndReopen(options);
+
+ int32_t value_size = 10 * 1024; // 10 KB
+
+ Random rnd(301);
+ std::vector<std::string> values;
+ // File with keys [ 0 => 99 ]
+ for (int i = 0; i < 100; i++) {
+ values.push_back(RandomString(&rnd, value_size));
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ("1", FilesPerLevel(0));
+ // Compaction will do L0=>L1 (trivial move) then move L1 files to L3
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 3;
+ compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,1", FilesPerLevel(0));
+ ASSERT_EQ(trivial_move, 1);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ // File with keys [ 100 => 199 ]
+ for (int i = 100; i < 200; i++) {
+ values.push_back(RandomString(&rnd, value_size));
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ("1,0,0,1", FilesPerLevel(0));
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = exclusive_manual_compaction_;
+ // Compaction will do L0=>L1 L1=>L2 L2=>L3 (3 trivial moves)
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,2", FilesPerLevel(0));
+ ASSERT_EQ(trivial_move, 4);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ for (int i = 0; i < 200; i++) {
+ ASSERT_EQ(Get(Key(i)), values[i]);
+ }
+
+ rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBCompactionTestWithParam, LevelCompactionThirdPath) {
+ Options options = CurrentOptions();
+ options.db_paths.emplace_back(dbname_, 500 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_2", 4 * 1024 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_3", 1024 * 1024 * 1024);
+ options.memtable_factory.reset(
+ new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+ options.compaction_style = kCompactionStyleLevel;
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = 4;
+ options.max_bytes_for_level_base = 400 * 1024;
+ options.max_subcompactions = max_subcompactions_;
+ // options = CurrentOptions(options);
+
+ std::vector<std::string> filenames;
+ env_->GetChildren(options.db_paths[1].path, &filenames);
+ // Delete archival files.
+ for (size_t i = 0; i < filenames.size(); ++i) {
+ env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i]);
+ }
+ env_->DeleteDir(options.db_paths[1].path);
+ Reopen(options);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+ // First three 110KB files are not going to second path.
+ // After that, (100K, 200K)
+ for (int num = 0; num < 3; num++) {
+ GenerateNewFile(&rnd, &key_idx);
+ }
+
+ // Another 110KB triggers a compaction to 400K file to fill up first path
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(3, GetSstFileCount(options.db_paths[1].path));
+
+ // (1, 4)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4", FilesPerLevel(0));
+ ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 4, 1)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,1", FilesPerLevel(0));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 4, 2)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,2", FilesPerLevel(0));
+ ASSERT_EQ(2, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 4, 3)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,3", FilesPerLevel(0));
+ ASSERT_EQ(3, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 4, 4)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,4", FilesPerLevel(0));
+ ASSERT_EQ(4, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 4, 5)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,5", FilesPerLevel(0));
+ ASSERT_EQ(5, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 4, 6)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,6", FilesPerLevel(0));
+ ASSERT_EQ(6, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 4, 7)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,7", FilesPerLevel(0));
+ ASSERT_EQ(7, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 4, 8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,8", FilesPerLevel(0));
+ ASSERT_EQ(8, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ Reopen(options);
+
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ Destroy(options);
+}
+
+TEST_P(DBCompactionTestWithParam, LevelCompactionPathUse) {
+ Options options = CurrentOptions();
+ options.db_paths.emplace_back(dbname_, 500 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_2", 4 * 1024 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_3", 1024 * 1024 * 1024);
+ options.memtable_factory.reset(
+ new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+ options.compaction_style = kCompactionStyleLevel;
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = 4;
+ options.max_bytes_for_level_base = 400 * 1024;
+ options.max_subcompactions = max_subcompactions_;
+ // options = CurrentOptions(options);
+
+ std::vector<std::string> filenames;
+ env_->GetChildren(options.db_paths[1].path, &filenames);
+ // Delete archival files.
+ for (size_t i = 0; i < filenames.size(); ++i) {
+ env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i]);
+ }
+ env_->DeleteDir(options.db_paths[1].path);
+ Reopen(options);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+ // Always gets compacted into 1 Level1 file,
+ // 0/1 Level 0 file
+ for (int num = 0; num < 3; num++) {
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ }
+
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,1", FilesPerLevel(0));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,1", FilesPerLevel(0));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,1", FilesPerLevel(0));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,1", FilesPerLevel(0));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,1", FilesPerLevel(0));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ Reopen(options);
+
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ Destroy(options);
+}
+
+TEST_P(DBCompactionTestWithParam, LevelCompactionCFPathUse) {
+ Options options = CurrentOptions();
+ options.db_paths.emplace_back(dbname_, 500 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_2", 4 * 1024 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_3", 1024 * 1024 * 1024);
+ options.memtable_factory.reset(
+ new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+ options.compaction_style = kCompactionStyleLevel;
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = 4;
+ options.max_bytes_for_level_base = 400 * 1024;
+ options.max_subcompactions = max_subcompactions_;
+
+ std::vector<Options> option_vector;
+ option_vector.emplace_back(options);
+ ColumnFamilyOptions cf_opt1(options), cf_opt2(options);
+ // Configure CF1 specific paths.
+ cf_opt1.cf_paths.emplace_back(dbname_ + "cf1", 500 * 1024);
+ cf_opt1.cf_paths.emplace_back(dbname_ + "cf1_2", 4 * 1024 * 1024);
+ cf_opt1.cf_paths.emplace_back(dbname_ + "cf1_3", 1024 * 1024 * 1024);
+ option_vector.emplace_back(DBOptions(options), cf_opt1);
+ CreateColumnFamilies({"one"},option_vector[1]);
+
+ // Configura CF2 specific paths.
+ cf_opt2.cf_paths.emplace_back(dbname_ + "cf2", 500 * 1024);
+ cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_2", 4 * 1024 * 1024);
+ cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_3", 1024 * 1024 * 1024);
+ option_vector.emplace_back(DBOptions(options), cf_opt2);
+ CreateColumnFamilies({"two"},option_vector[2]);
+
+ ReopenWithColumnFamilies({"default", "one", "two"}, option_vector);
+
+ Random rnd(301);
+ int key_idx = 0;
+ int key_idx1 = 0;
+ int key_idx2 = 0;
+
+ auto generate_file = [&]() {
+ GenerateNewFile(0, &rnd, &key_idx);
+ GenerateNewFile(1, &rnd, &key_idx1);
+ GenerateNewFile(2, &rnd, &key_idx2);
+ };
+
+ auto check_sstfilecount = [&](int path_id, int expected) {
+ ASSERT_EQ(expected, GetSstFileCount(options.db_paths[path_id].path));
+ ASSERT_EQ(expected, GetSstFileCount(cf_opt1.cf_paths[path_id].path));
+ ASSERT_EQ(expected, GetSstFileCount(cf_opt2.cf_paths[path_id].path));
+ };
+
+ auto check_filesperlevel = [&](const std::string& expected) {
+ ASSERT_EQ(expected, FilesPerLevel(0));
+ ASSERT_EQ(expected, FilesPerLevel(1));
+ ASSERT_EQ(expected, FilesPerLevel(2));
+ };
+
+ auto check_getvalues = [&]() {
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(0, Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ for (int i = 0; i < key_idx1; i++) {
+ auto v = Get(1, Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ for (int i = 0; i < key_idx2; i++) {
+ auto v = Get(2, Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+ };
+
+ // Check that default column family uses db_paths.
+ // And Column family "one" uses cf_paths.
+
+ // First three 110KB files are not going to second path.
+ // After that, (100K, 200K)
+ for (int num = 0; num < 3; num++) {
+ generate_file();
+ }
+
+ // Another 110KB triggers a compaction to 400K file to fill up first path
+ generate_file();
+ check_sstfilecount(1, 3);
+
+ // (1, 4)
+ generate_file();
+ check_filesperlevel("1,4");
+ check_sstfilecount(1, 4);
+ check_sstfilecount(0, 1);
+
+ // (1, 4, 1)
+ generate_file();
+ check_filesperlevel("1,4,1");
+ check_sstfilecount(2, 1);
+ check_sstfilecount(1, 4);
+ check_sstfilecount(0, 1);
+
+ // (1, 4, 2)
+ generate_file();
+ check_filesperlevel("1,4,2");
+ check_sstfilecount(2, 2);
+ check_sstfilecount(1, 4);
+ check_sstfilecount(0, 1);
+
+ check_getvalues();
+
+ ReopenWithColumnFamilies({"default", "one", "two"}, option_vector);
+
+ check_getvalues();
+
+ Destroy(options, true);
+}
+
+TEST_P(DBCompactionTestWithParam, ConvertCompactionStyle) {
+ Random rnd(301);
+ int max_key_level_insert = 200;
+ int max_key_universal_insert = 600;
+
+ // Stage 1: generate a db with level compaction
+ Options options = CurrentOptions();
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.arena_block_size = 4 << 10;
+ options.num_levels = 4;
+ options.level0_file_num_compaction_trigger = 3;
+ options.max_bytes_for_level_base = 500 << 10; // 500KB
+ options.max_bytes_for_level_multiplier = 1;
+ options.target_file_size_base = 200 << 10; // 200KB
+ options.target_file_size_multiplier = 1;
+ options.max_subcompactions = max_subcompactions_;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ for (int i = 0; i <= max_key_level_insert; i++) {
+ // each value is 10K
+ ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000)));
+ }
+ ASSERT_OK(Flush(1));
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_GT(TotalTableFiles(1, 4), 1);
+ int non_level0_num_files = 0;
+ for (int i = 1; i < options.num_levels; i++) {
+ non_level0_num_files += NumTableFilesAtLevel(i, 1);
+ }
+ ASSERT_GT(non_level0_num_files, 0);
+
+ // Stage 2: reopen with universal compaction - should fail
+ options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 1;
+ options = CurrentOptions(options);
+ Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_TRUE(s.IsInvalidArgument());
+
+ // Stage 3: compact into a single file and move the file to level 0
+ options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.target_file_size_base = INT_MAX;
+ options.target_file_size_multiplier = 1;
+ options.max_bytes_for_level_base = INT_MAX;
+ options.max_bytes_for_level_multiplier = 1;
+ options.num_levels = 4;
+ options = CurrentOptions(options);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 0;
+ compact_options.bottommost_level_compaction =
+ BottommostLevelCompaction::kForce;
+ compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+ dbfull()->CompactRange(compact_options, handles_[1], nullptr, nullptr);
+
+ // Only 1 file in L0
+ ASSERT_EQ("1", FilesPerLevel(1));
+
+ // Stage 4: re-open in universal compaction style and do some db operations
+ options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 4;
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 3;
+ options = CurrentOptions(options);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ options.num_levels = 1;
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ for (int i = max_key_level_insert / 2; i <= max_key_universal_insert; i++) {
+ ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000)));
+ }
+ dbfull()->Flush(FlushOptions());
+ ASSERT_OK(Flush(1));
+ dbfull()->TEST_WaitForCompact();
+
+ for (int i = 1; i < options.num_levels; i++) {
+ ASSERT_EQ(NumTableFilesAtLevel(i, 1), 0);
+ }
+
+ // verify keys inserted in both level compaction style and universal
+ // compaction style
+ std::string keys_in_db;
+ Iterator* iter = dbfull()->NewIterator(ReadOptions(), handles_[1]);
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ keys_in_db.append(iter->key().ToString());
+ keys_in_db.push_back(',');
+ }
+ delete iter;
+
+ std::string expected_keys;
+ for (int i = 0; i <= max_key_universal_insert; i++) {
+ expected_keys.append(Key(i));
+ expected_keys.push_back(',');
+ }
+
+ ASSERT_EQ(keys_in_db, expected_keys);
+}
+
+TEST_F(DBCompactionTest, L0_CompactionBug_Issue44_a) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "b", "v"));
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_OK(Delete(1, "b"));
+ ASSERT_OK(Delete(1, "a"));
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_OK(Delete(1, "a"));
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "a", "v"));
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_EQ("(a->v)", Contents(1));
+ env_->SleepForMicroseconds(1000000); // Wait for compaction to finish
+ ASSERT_EQ("(a->v)", Contents(1));
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBCompactionTest, L0_CompactionBug_Issue44_b) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ Put(1, "", "");
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ Delete(1, "e");
+ Put(1, "", "");
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ Put(1, "c", "cv");
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ Put(1, "", "");
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ Put(1, "", "");
+ env_->SleepForMicroseconds(1000000); // Wait for compaction to finish
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ Put(1, "d", "dv");
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ Put(1, "", "");
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ Delete(1, "d");
+ Delete(1, "b");
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_EQ("(->)(c->cv)", Contents(1));
+ env_->SleepForMicroseconds(1000000); // Wait for compaction to finish
+ ASSERT_EQ("(->)(c->cv)", Contents(1));
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBCompactionTest, ManualAutoRace) {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ rocksdb::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::BGWorkCompaction", "DBCompactionTest::ManualAutoRace:1"},
+ {"DBImpl::RunManualCompaction:WaitScheduled",
+ "BackgroundCallCompaction:0"}});
+
+ rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+ Put(1, "foo", "");
+ Put(1, "bar", "");
+ Flush(1);
+ Put(1, "foo", "");
+ Put(1, "bar", "");
+ // Generate four files in CF 0, which should trigger an auto compaction
+ Put("foo", "");
+ Put("bar", "");
+ Flush();
+ Put("foo", "");
+ Put("bar", "");
+ Flush();
+ Put("foo", "");
+ Put("bar", "");
+ Flush();
+ Put("foo", "");
+ Put("bar", "");
+ Flush();
+
+ // The auto compaction is scheduled but waited until here
+ TEST_SYNC_POINT("DBCompactionTest::ManualAutoRace:1");
+ // The auto compaction will wait until the manual compaction is registerd
+ // before processing so that it will be cancelled.
+ dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr);
+ ASSERT_EQ("0,1", FilesPerLevel(1));
+
+ // Eventually the cancelled compaction will be rescheduled and executed.
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+ rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBCompactionTestWithParam, ManualCompaction) {
+ Options options = CurrentOptions();
+ options.max_subcompactions = max_subcompactions_;
+ options.statistics = rocksdb::CreateDBStatistics();
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // iter - 0 with 7 levels
+ // iter - 1 with 3 levels
+ for (int iter = 0; iter < 2; ++iter) {
+ MakeTables(3, "p", "q", 1);
+ ASSERT_EQ("1,1,1", FilesPerLevel(1));
+
+ // Compaction range falls before files
+ Compact(1, "", "c");
+ ASSERT_EQ("1,1,1", FilesPerLevel(1));
+
+ // Compaction range falls after files
+ Compact(1, "r", "z");
+ ASSERT_EQ("1,1,1", FilesPerLevel(1));
+
+ // Compaction range overlaps files
+ Compact(1, "p1", "p9");
+ ASSERT_EQ("0,0,1", FilesPerLevel(1));
+
+ // Populate a different range
+ MakeTables(3, "c", "e", 1);
+ ASSERT_EQ("1,1,2", FilesPerLevel(1));
+
+ // Compact just the new range
+ Compact(1, "b", "f");
+ ASSERT_EQ("0,0,2", FilesPerLevel(1));
+
+ // Compact all
+ MakeTables(1, "a", "z", 1);
+ ASSERT_EQ("1,0,2", FilesPerLevel(1));
+
+ uint64_t prev_block_cache_add =
+ options.statistics->getTickerCount(BLOCK_CACHE_ADD);
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = exclusive_manual_compaction_;
+ db_->CompactRange(cro, handles_[1], nullptr, nullptr);
+ // Verify manual compaction doesn't fill block cache
+ ASSERT_EQ(prev_block_cache_add,
+ options.statistics->getTickerCount(BLOCK_CACHE_ADD));
+
+ ASSERT_EQ("0,0,1", FilesPerLevel(1));
+
+ if (iter == 0) {
+ options = CurrentOptions();
+ options.num_levels = 3;
+ options.create_if_missing = true;
+ options.statistics = rocksdb::CreateDBStatistics();
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ }
+ }
+}
+
+
+TEST_P(DBCompactionTestWithParam, ManualLevelCompactionOutputPathId) {
+ Options options = CurrentOptions();
+ options.db_paths.emplace_back(dbname_ + "_2", 2 * 10485760);
+ options.db_paths.emplace_back(dbname_ + "_3", 100 * 10485760);
+ options.db_paths.emplace_back(dbname_ + "_4", 120 * 10485760);
+ options.max_subcompactions = max_subcompactions_;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // iter - 0 with 7 levels
+ // iter - 1 with 3 levels
+ for (int iter = 0; iter < 2; ++iter) {
+ for (int i = 0; i < 3; ++i) {
+ ASSERT_OK(Put(1, "p", "begin"));
+ ASSERT_OK(Put(1, "q", "end"));
+ ASSERT_OK(Flush(1));
+ }
+ ASSERT_EQ("3", FilesPerLevel(1));
+ ASSERT_EQ(3, GetSstFileCount(options.db_paths[0].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // Compaction range falls before files
+ Compact(1, "", "c");
+ ASSERT_EQ("3", FilesPerLevel(1));
+
+ // Compaction range falls after files
+ Compact(1, "r", "z");
+ ASSERT_EQ("3", FilesPerLevel(1));
+
+ // Compaction range overlaps files
+ Compact(1, "p1", "p9", 1);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ("0,1", FilesPerLevel(1));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // Populate a different range
+ for (int i = 0; i < 3; ++i) {
+ ASSERT_OK(Put(1, "c", "begin"));
+ ASSERT_OK(Put(1, "e", "end"));
+ ASSERT_OK(Flush(1));
+ }
+ ASSERT_EQ("3,1", FilesPerLevel(1));
+
+ // Compact just the new range
+ Compact(1, "b", "f", 1);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ("0,2", FilesPerLevel(1));
+ ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // Compact all
+ ASSERT_OK(Put(1, "a", "begin"));
+ ASSERT_OK(Put(1, "z", "end"));
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ("1,2", FilesPerLevel(1));
+ ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path));
+ CompactRangeOptions compact_options;
+ compact_options.target_path_id = 1;
+ compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+ db_->CompactRange(compact_options, handles_[1], nullptr, nullptr);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_EQ("0,1", FilesPerLevel(1));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ if (iter == 0) {
+ DestroyAndReopen(options);
+ options = CurrentOptions();
+ options.db_paths.emplace_back(dbname_ + "_2", 2 * 10485760);
+ options.db_paths.emplace_back(dbname_ + "_3", 100 * 10485760);
+ options.db_paths.emplace_back(dbname_ + "_4", 120 * 10485760);
+ options.max_background_flushes = 1;
+ options.num_levels = 3;
+ options.create_if_missing = true;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ }
+ }
+}
+
+TEST_F(DBCompactionTest, FilesDeletedAfterCompaction) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "foo", "v2"));
+ Compact(1, "a", "z");
+ const size_t num_files = CountLiveFiles();
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Put(1, "foo", "v2"));
+ Compact(1, "a", "z");
+ }
+ ASSERT_EQ(CountLiveFiles(), num_files);
+ } while (ChangeCompactOptions());
+}
+
+// Check level comapction with compact files
+TEST_P(DBCompactionTestWithParam, DISABLED_CompactFilesOnLevelCompaction) {
+ const int kTestKeySize = 16;
+ const int kTestValueSize = 984;
+ const int kEntrySize = kTestKeySize + kTestValueSize;
+ const int kEntriesPerBuffer = 100;
+ Options options;
+ options.create_if_missing = true;
+ options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
+ options.compaction_style = kCompactionStyleLevel;
+ options.target_file_size_base = options.write_buffer_size;
+ options.max_bytes_for_level_base = options.target_file_size_base * 2;
+ options.level0_stop_writes_trigger = 2;
+ options.max_bytes_for_level_multiplier = 2;
+ options.compression = kNoCompression;
+ options.max_subcompactions = max_subcompactions_;
+ options = CurrentOptions(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ Random rnd(301);
+ for (int key = 64 * kEntriesPerBuffer; key >= 0; --key) {
+ ASSERT_OK(Put(1, ToString(key), RandomString(&rnd, kTestValueSize)));
+ }
+ dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+ dbfull()->TEST_WaitForCompact();
+
+ ColumnFamilyMetaData cf_meta;
+ dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+ int output_level = static_cast<int>(cf_meta.levels.size()) - 1;
+ for (int file_picked = 5; file_picked > 0; --file_picked) {
+ std::set<std::string> overlapping_file_names;
+ std::vector<std::string> compaction_input_file_names;
+ for (int f = 0; f < file_picked; ++f) {
+ int level = 0;
+ auto file_meta = PickFileRandomly(cf_meta, &rnd, &level);
+ compaction_input_file_names.push_back(file_meta->name);
+ GetOverlappingFileNumbersForLevelCompaction(
+ cf_meta, options.comparator, level, output_level,
+ file_meta, &overlapping_file_names);
+ }
+
+ ASSERT_OK(dbfull()->CompactFiles(
+ CompactionOptions(), handles_[1],
+ compaction_input_file_names,
+ output_level));
+
+ // Make sure all overlapping files do not exist after compaction
+ dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+ VerifyCompactionResult(cf_meta, overlapping_file_names);
+ }
+
+ // make sure all key-values are still there.
+ for (int key = 64 * kEntriesPerBuffer; key >= 0; --key) {
+ ASSERT_NE(Get(1, ToString(key)), "NOT_FOUND");
+ }
+}
+
+TEST_P(DBCompactionTestWithParam, PartialCompactionFailure) {
+ Options options;
+ const int kKeySize = 16;
+ const int kKvSize = 1000;
+ const int kKeysPerBuffer = 100;
+ const int kNumL1Files = 5;
+ options.create_if_missing = true;
+ options.write_buffer_size = kKeysPerBuffer * kKvSize;
+ options.max_write_buffer_number = 2;
+ options.target_file_size_base =
+ options.write_buffer_size *
+ (options.max_write_buffer_number - 1);
+ options.level0_file_num_compaction_trigger = kNumL1Files;
+ options.max_bytes_for_level_base =
+ options.level0_file_num_compaction_trigger *
+ options.target_file_size_base;
+ options.max_bytes_for_level_multiplier = 2;
+ options.compression = kNoCompression;
+ options.max_subcompactions = max_subcompactions_;
+
+ env_->SetBackgroundThreads(1, Env::HIGH);
+ env_->SetBackgroundThreads(1, Env::LOW);
+ // stop the compaction thread until we simulate the file creation failure.
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+
+ options.env = env_;
+
+ DestroyAndReopen(options);
+
+ const int kNumInsertedKeys =
+ options.level0_file_num_compaction_trigger *
+ (options.max_write_buffer_number - 1) *
+ kKeysPerBuffer;
+
+ Random rnd(301);
+ std::vector<std::string> keys;
+ std::vector<std::string> values;
+ for (int k = 0; k < kNumInsertedKeys; ++k) {
+ keys.emplace_back(RandomString(&rnd, kKeySize));
+ values.emplace_back(RandomString(&rnd, kKvSize - kKeySize));
+ ASSERT_OK(Put(Slice(keys[k]), Slice(values[k])));
+ dbfull()->TEST_WaitForFlushMemTable();
+ }
+
+ dbfull()->TEST_FlushMemTable(true);
+ // Make sure the number of L0 files can trigger compaction.
+ ASSERT_GE(NumTableFilesAtLevel(0),
+ options.level0_file_num_compaction_trigger);
+
+ auto previous_num_level0_files = NumTableFilesAtLevel(0);
+
+ // Fail the first file creation.
+ env_->non_writable_count_ = 1;
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilDone();
+
+ // Expect compaction to fail here as one file will fail its
+ // creation.
+ ASSERT_TRUE(!dbfull()->TEST_WaitForCompact().ok());
+
+ // Verify L0 -> L1 compaction does fail.
+ ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+
+ // Verify all L0 files are still there.
+ ASSERT_EQ(NumTableFilesAtLevel(0), previous_num_level0_files);
+
+ // All key-values must exist after compaction fails.
+ for (int k = 0; k < kNumInsertedKeys; ++k) {
+ ASSERT_EQ(values[k], Get(keys[k]));
+ }
+
+ env_->non_writable_count_ = 0;
+
+ // Make sure RocksDB will not get into corrupted state.
+ Reopen(options);
+
+ // Verify again after reopen.
+ for (int k = 0; k < kNumInsertedKeys; ++k) {
+ ASSERT_EQ(values[k], Get(keys[k]));
+ }
+}
+
+TEST_P(DBCompactionTestWithParam, DeleteMovedFileAfterCompaction) {
+ // iter 1 -- delete_obsolete_files_period_micros == 0
+ for (int iter = 0; iter < 2; ++iter) {
+ // This test triggers move compaction and verifies that the file is not
+ // deleted when it's part of move compaction
+ Options options = CurrentOptions();
+ options.env = env_;
+ if (iter == 1) {
+ options.delete_obsolete_files_period_micros = 0;
+ }
+ options.create_if_missing = true;
+ options.level0_file_num_compaction_trigger =
+ 2; // trigger compaction when we have 2 files
+ OnFileDeletionListener* listener = new OnFileDeletionListener();
+ options.listeners.emplace_back(listener);
+ options.max_subcompactions = max_subcompactions_;
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ // Create two 1MB sst files
+ for (int i = 0; i < 2; ++i) {
+ // Create 1MB sst file
+ for (int j = 0; j < 100; ++j) {
+ ASSERT_OK(Put(Key(i * 50 + j), RandomString(&rnd, 10 * 1024)));
+ }
+ ASSERT_OK(Flush());
+ }
+ // this should execute L0->L1
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+
+ // block compactions
+ test::SleepingBackgroundTask sleeping_task;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task,
+ Env::Priority::LOW);
+
+ options.max_bytes_for_level_base = 1024 * 1024; // 1 MB
+ Reopen(options);
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ReadOptions()));
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+ // let compactions go
+ sleeping_task.WakeUp();
+ sleeping_task.WaitUntilDone();
+
+ // this should execute L1->L2 (move)
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_EQ("0,0,1", FilesPerLevel(0));
+
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+ ASSERT_EQ(metadata.size(), 1U);
+ auto moved_file_name = metadata[0].name;
+
+ // Create two more 1MB sst files
+ for (int i = 0; i < 2; ++i) {
+ // Create 1MB sst file
+ for (int j = 0; j < 100; ++j) {
+ ASSERT_OK(Put(Key(i * 50 + j + 100), RandomString(&rnd, 10 * 1024)));
+ }
+ ASSERT_OK(Flush());
+ }
+ // this should execute both L0->L1 and L1->L2 (merge with previous file)
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_EQ("0,0,2", FilesPerLevel(0));
+
+ // iterator is holding the file
+ ASSERT_OK(env_->FileExists(dbname_ + moved_file_name));
+
+ listener->SetExpectedFileName(dbname_ + moved_file_name);
+ iterator.reset();
+
+ // this file should have been compacted away
+ ASSERT_NOK(env_->FileExists(dbname_ + moved_file_name));
+ listener->VerifyMatchedCount(1);
+ }
+}
+
+TEST_P(DBCompactionTestWithParam, CompressLevelCompaction) {
+ if (!Zlib_Supported()) {
+ return;
+ }
+ Options options = CurrentOptions();
+ options.memtable_factory.reset(
+ new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+ options.compaction_style = kCompactionStyleLevel;
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = 4;
+ options.max_bytes_for_level_base = 400 * 1024;
+ options.max_subcompactions = max_subcompactions_;
+ // First two levels have no compression, so that a trivial move between
+ // them will be allowed. Level 2 has Zlib compression so that a trivial
+ // move to level 3 will not be allowed
+ options.compression_per_level = {kNoCompression, kNoCompression,
+ kZlibCompression};
+ int matches = 0, didnt_match = 0, trivial_move = 0, non_trivial = 0;
+
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "Compaction::InputCompressionMatchesOutput:Matches",
+ [&](void* /*arg*/) { matches++; });
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "Compaction::InputCompressionMatchesOutput:DidntMatch",
+ [&](void* /*arg*/) { didnt_match++; });
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial",
+ [&](void* /*arg*/) { non_trivial++; });
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+ Reopen(options);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+ // First three 110KB files are going to level 0
+ // After that, (100K, 200K)
+ for (int num = 0; num < 3; num++) {
+ GenerateNewFile(&rnd, &key_idx);
+ }
+
+ // Another 110KB triggers a compaction to 400K file to fill up level 0
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(4, GetSstFileCount(dbname_));
+
+ // (1, 4)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4", FilesPerLevel(0));
+
+ // (1, 4, 1)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,1", FilesPerLevel(0));
+
+ // (1, 4, 2)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,2", FilesPerLevel(0));
+
+ // (1, 4, 3)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,3", FilesPerLevel(0));
+
+ // (1, 4, 4)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,4", FilesPerLevel(0));
+
+ // (1, 4, 5)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,5", FilesPerLevel(0));
+
+ // (1, 4, 6)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,6", FilesPerLevel(0));
+
+ // (1, 4, 7)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,7", FilesPerLevel(0));
+
+ // (1, 4, 8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,8", FilesPerLevel(0));
+
+ ASSERT_EQ(matches, 12);
+ // Currently, the test relies on the number of calls to
+ // InputCompressionMatchesOutput() per compaction.
+ const int kCallsToInputCompressionMatch = 2;
+ ASSERT_EQ(didnt_match, 8 * kCallsToInputCompressionMatch);
+ ASSERT_EQ(trivial_move, 12);
+ ASSERT_EQ(non_trivial, 8);
+
+ rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ Reopen(options);
+
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ Destroy(options);
+}
+
+TEST_F(DBCompactionTest, SanitizeCompactionOptionsTest) {
+ Options options = CurrentOptions();
+ options.max_background_compactions = 5;
+ options.soft_pending_compaction_bytes_limit = 0;
+ options.hard_pending_compaction_bytes_limit = 100;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ ASSERT_EQ(100, db_->GetOptions().soft_pending_compaction_bytes_limit);
+
+ options.max_background_compactions = 3;
+ options.soft_pending_compaction_bytes_limit = 200;
+ options.hard_pending_compaction_bytes_limit = 150;
+ DestroyAndReopen(options);
+ ASSERT_EQ(150, db_->GetOptions().soft_pending_compaction_bytes_limit);
+}
+
+// This tests for a bug that could cause two level0 compactions running
+// concurrently
+// TODO(aekmekji): Make sure that the reason this fails when run with
+// max_subcompactions > 1 is not a correctness issue but just inherent to
+// running parallel L0-L1 compactions
+TEST_F(DBCompactionTest, SuggestCompactRangeNoTwoLevel0Compactions) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleLevel;
+ options.write_buffer_size = 110 << 10;
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 4;
+ options.num_levels = 4;
+ options.compression = kNoCompression;
+ options.max_bytes_for_level_base = 450 << 10;
+ options.target_file_size_base = 98 << 10;
+ options.max_write_buffer_number = 2;
+ options.max_background_compactions = 2;
+
+ DestroyAndReopen(options);
+
+ // fill up the DB
+ Random rnd(301);
+ for (int num = 0; num < 10; num++) {
+ GenerateNewRandomFile(&rnd);
+ }
+ db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+
+ rocksdb::SyncPoint::GetInstance()->LoadDependency(
+ {{"CompactionJob::Run():Start",
+ "DBCompactionTest::SuggestCompactRangeNoTwoLevel0Compactions:1"},
+ {"DBCompactionTest::SuggestCompactRangeNoTwoLevel0Compactions:2",
+ "CompactionJob::Run():End"}});
+
+ rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+ // trigger L0 compaction
+ for (int num = 0; num < options.level0_file_num_compaction_trigger + 1;
+ num++) {
+ GenerateNewRandomFile(&rnd, /* nowait */ true);
+ ASSERT_OK(Flush());
+ }
+
+ TEST_SYNC_POINT(
+ "DBCompactionTest::SuggestCompactRangeNoTwoLevel0Compactions:1");
+
+ GenerateNewRandomFile(&rnd, /* nowait */ true);
+ dbfull()->TEST_WaitForFlushMemTable();
+ ASSERT_OK(experimental::SuggestCompactRange(db_, nullptr, nullptr));
+ for (int num = 0; num < options.level0_file_num_compaction_trigger + 1;
+ num++) {
+ GenerateNewRandomFile(&rnd, /* nowait */ true);
+ ASSERT_OK(Flush());
+ }
+
+ TEST_SYNC_POINT(
+ "DBCompactionTest::SuggestCompactRangeNoTwoLevel0Compactions:2");
+ dbfull()->TEST_WaitForCompact();
+}
+
+static std::string ShortKey(int i) {
+ assert(i < 10000);
+ char buf[100];
+ snprintf(buf, sizeof(buf), "key%04d", i);
+ return std::string(buf);
+}
+
+TEST_P(DBCompactionTestWithParam, ForceBottommostLevelCompaction) {
+ int32_t trivial_move = 0;
+ int32_t non_trivial_move = 0;
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial",
+ [&](void* /*arg*/) { non_trivial_move++; });
+ rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+ // The key size is guaranteed to be <= 8
+ class ShortKeyComparator : public Comparator {
+ int Compare(const rocksdb::Slice& a,
+ const rocksdb::Slice& b) const override {
+ assert(a.size() <= 8);
+ assert(b.size() <= 8);
+ return BytewiseComparator()->Compare(a, b);
+ }
+ const char* Name() const override { return "ShortKeyComparator"; }
+ void FindShortestSeparator(std::string* start,
+ const rocksdb::Slice& limit) const override {
+ return BytewiseComparator()->FindShortestSeparator(start, limit);
+ }
+ void FindShortSuccessor(std::string* key) const override {
+ return BytewiseComparator()->FindShortSuccessor(key);
+ }
+ } short_key_cmp;
+ Options options = CurrentOptions();
+ options.target_file_size_base = 100000000;
+ options.write_buffer_size = 100000000;
+ options.max_subcompactions = max_subcompactions_;
+ options.comparator = &short_key_cmp;
+ DestroyAndReopen(options);
+
+ int32_t value_size = 10 * 1024; // 10 KB
+
+ Random rnd(301);
+ std::vector<std::string> values;
+ // File with keys [ 0 => 99 ]
+ for (int i = 0; i < 100; i++) {
+ values.push_back(RandomString(&rnd, value_size));
+ ASSERT_OK(Put(ShortKey(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ("1", FilesPerLevel(0));
+ // Compaction will do L0=>L1 (trivial move) then move L1 files to L3
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 3;
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,1", FilesPerLevel(0));
+ ASSERT_EQ(trivial_move, 1);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ // File with keys [ 100 => 199 ]
+ for (int i = 100; i < 200; i++) {
+ values.push_back(RandomString(&rnd, value_size));
+ ASSERT_OK(Put(ShortKey(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ("1,0,0,1", FilesPerLevel(0));
+ // Compaction will do L0=>L1 L1=>L2 L2=>L3 (3 trivial moves)
+ // then compacte the bottommost level L3=>L3 (non trivial move)
+ compact_options = CompactRangeOptions();
+ compact_options.bottommost_level_compaction =
+ BottommostLevelCompaction::kForce;
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,1", FilesPerLevel(0));
+ ASSERT_EQ(trivial_move, 4);
+ ASSERT_EQ(non_trivial_move, 1);
+
+ // File with keys [ 200 => 299 ]
+ for (int i = 200; i < 300; i++) {
+ values.push_back(RandomString(&rnd, value_size));
+ ASSERT_OK(Put(ShortKey(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ("1,0,0,1", FilesPerLevel(0));
+ trivial_move = 0;
+ non_trivial_move = 0;
+ compact_options = CompactRangeOptions();
+ compact_options.bottommost_level_compaction =
+ BottommostLevelCompaction::kSkip;
+ // Compaction will do L0=>L1 L1=>L2 L2=>L3 (3 trivial moves)
+ // and will skip bottommost level compaction
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,2", FilesPerLevel(0));
+ ASSERT_EQ(trivial_move, 3);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ for (int i = 0; i < 300; i++) {
+ ASSERT_EQ(Get(ShortKey(i)), values[i]);
+ }
+
+ rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBCompactionTestWithParam, IntraL0Compaction) {
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.level0_file_num_compaction_trigger = 5;
+ options.max_background_compactions = 2;
+ options.max_subcompactions = max_subcompactions_;
+ DestroyAndReopen(options);
+
+ const size_t kValueSize = 1 << 20;
+ Random rnd(301);
+ std::string value(RandomString(&rnd, kValueSize));
+
+ rocksdb::SyncPoint::GetInstance()->LoadDependency(
+ {{"LevelCompactionPicker::PickCompactionBySize:0",
+ "CompactionJob::Run():Start"}});
+ rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+ // index: 0 1 2 3 4 5 6 7 8 9
+ // size: 1MB 1MB 1MB 1MB 1MB 2MB 1MB 1MB 1MB 1MB
+ // score: 1.5 1.3 1.5 2.0 inf
+ //
+ // Files 0-4 will be included in an L0->L1 compaction.
+ //
+ // L0->L0 will be triggered since the sync points guarantee compaction to base
+ // level is still blocked when files 5-9 trigger another compaction.
+ //
+ // Files 6-9 are the longest span of available files for which
+ // work-per-deleted-file decreases (see "score" row above).
+ for (int i = 0; i < 10; ++i) {
+ ASSERT_OK(Put(Key(0), "")); // prevents trivial move
+ if (i == 5) {
+ ASSERT_OK(Put(Key(i + 1), value + value));
+ } else {
+ ASSERT_OK(Put(Key(i + 1), value));
+ }
+ ASSERT_OK(Flush());
+ }
+ dbfull()->TEST_WaitForCompact();
+ rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+
+ std::vector<std::vector<FileMetaData>> level_to_files;
+ dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+ &level_to_files);
+ ASSERT_GE(level_to_files.size(), 2); // at least L0 and L1
+ // L0 has the 2MB file (not compacted) and 4MB file (output of L0->L0)
+ ASSERT_EQ(2, level_to_files[0].size());
+ ASSERT_GT(level_to_files[1].size(), 0);
+ for (int i = 0; i < 2; ++i) {
+ ASSERT_GE(level_to_files[0][i].fd.file_size, 1 << 21);
+ }
+}
+
+TEST_P(DBCompactionTestWithParam, IntraL0CompactionDoesNotObsoleteDeletions) {
+ // regression test for issue #2722: L0->L0 compaction can resurrect deleted
+ // keys from older L0 files if L1+ files' key-ranges do not include the key.
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.level0_file_num_compaction_trigger = 5;
+ options.max_background_compactions = 2;
+ options.max_subcompactions = max_subcompactions_;
+ DestroyAndReopen(options);
+
+ const size_t kValueSize = 1 << 20;
+ Random rnd(301);
+ std::string value(RandomString(&rnd, kValueSize));
+
+ rocksdb::SyncPoint::GetInstance()->LoadDependency(
+ {{"LevelCompactionPicker::PickCompactionBySize:0",
+ "CompactionJob::Run():Start"}});
+ rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+ // index: 0 1 2 3 4 5 6 7 8 9
+ // size: 1MB 1MB 1MB 1MB 1MB 1MB 1MB 1MB 1MB 1MB
+ // score: 1.25 1.33 1.5 2.0 inf
+ //
+ // Files 0-4 will be included in an L0->L1 compaction.
+ //
+ // L0->L0 will be triggered since the sync points guarantee compaction to base
+ // level is still blocked when files 5-9 trigger another compaction. All files
+ // 5-9 are included in the L0->L0 due to work-per-deleted file decreasing.
+ //
+ // Put a key-value in files 0-4. Delete that key in files 5-9. Verify the
+ // L0->L0 preserves the deletion such that the key remains deleted.
+ for (int i = 0; i < 10; ++i) {
+ // key 0 serves both to prevent trivial move and as the key we want to
+ // verify is not resurrected by L0->L0 compaction.
+ if (i < 5) {
+ ASSERT_OK(Put(Key(0), ""));
+ } else {
+ ASSERT_OK(Delete(Key(0)));
+ }
+ ASSERT_OK(Put(Key(i + 1), value));
+ ASSERT_OK(Flush());
+ }
+ dbfull()->TEST_WaitForCompact();
+ rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+
+ std::vector<std::vector<FileMetaData>> level_to_files;
+ dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+ &level_to_files);
+ ASSERT_GE(level_to_files.size(), 2); // at least L0 and L1
+ // L0 has a single output file from L0->L0
+ ASSERT_EQ(1, level_to_files[0].size());
+ ASSERT_GT(level_to_files[1].size(), 0);
+ ASSERT_GE(level_to_files[0][0].fd.file_size, 1 << 22);
+
+ ReadOptions roptions;
+ std::string result;
+ ASSERT_TRUE(db_->Get(roptions, Key(0), &result).IsNotFound());
+}
+
+TEST_P(DBCompactionTestWithParam, FullCompactionInBottomPriThreadPool) {
+ const int kNumFilesTrigger = 3;
+ Env::Default()->SetBackgroundThreads(1, Env::Priority::BOTTOM);
+ for (bool use_universal_compaction : {false, true}) {
+ Options options = CurrentOptions();
+ if (use_universal_compaction) {
+ options.compaction_style = kCompactionStyleUniversal;
+ } else {
+ options.compaction_style = kCompactionStyleLevel;
+ options.level_compaction_dynamic_level_bytes = true;
+ }
+ options.num_levels = 4;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.target_file_size_base = 32 << 10; // 32KB
+ options.level0_file_num_compaction_trigger = kNumFilesTrigger;
+ // Trigger compaction if size amplification exceeds 110%
+ options.compaction_options_universal.max_size_amplification_percent = 110;
+ DestroyAndReopen(options);
+
+ int num_bottom_pri_compactions = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BGWorkBottomCompaction",
+ [&](void* /*arg*/) { ++num_bottom_pri_compactions; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ for (int num = 0; num < kNumFilesTrigger; num++) {
+ ASSERT_EQ(NumSortedRuns(), num);
+ int key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ }
+ dbfull()->TEST_WaitForCompact();
+
+ ASSERT_EQ(1, num_bottom_pri_compactions);
+
+ // Verify that size amplification did occur
+ ASSERT_EQ(NumSortedRuns(), 1);
+ rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+ }
+ Env::Default()->SetBackgroundThreads(0, Env::Priority::BOTTOM);
+}
+
+TEST_F(DBCompactionTest, OptimizedDeletionObsoleting) {
+ // Deletions can be dropped when compacted to non-last level if they fall
+ // outside the lower-level files' key-ranges.
+ const int kNumL0Files = 4;
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ options.statistics = rocksdb::CreateDBStatistics();
+ DestroyAndReopen(options);
+
+ // put key 1 and 3 in separate L1, L2 files.
+ // So key 0, 2, and 4+ fall outside these levels' key-ranges.
+ for (int level = 2; level >= 1; --level) {
+ for (int i = 0; i < 2; ++i) {
+ Put(Key(2 * i + 1), "val");
+ Flush();
+ }
+ MoveFilesToLevel(level);
+ ASSERT_EQ(2, NumTableFilesAtLevel(level));
+ }
+
+ // Delete keys in range [1, 4]. These L0 files will be compacted with L1:
+ // - Tombstones for keys 2 and 4 can be dropped early.
+ // - Tombstones for keys 1 and 3 must be kept due to L2 files' key-ranges.
+ for (int i = 0; i < kNumL0Files; ++i) {
+ Put(Key(0), "val"); // sentinel to prevent trivial move
+ Delete(Key(i + 1));
+ Flush();
+ }
+ dbfull()->TEST_WaitForCompact();
+
+ for (int i = 0; i < kNumL0Files; ++i) {
+ std::string value;
+ ASSERT_TRUE(db_->Get(ReadOptions(), Key(i + 1), &value).IsNotFound());
+ }
+ ASSERT_EQ(2, options.statistics->getTickerCount(
+ COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE));
+ ASSERT_EQ(2,
+ options.statistics->getTickerCount(COMPACTION_KEY_DROP_OBSOLETE));
+}
+
+TEST_F(DBCompactionTest, CompactFilesPendingL0Bug) {
+ // https://www.facebook.com/groups/rocksdb.dev/permalink/1389452781153232/
+ // CompactFiles() had a bug where it failed to pick a compaction when an L0
+ // compaction existed, but marked it as scheduled anyways. It'd never be
+ // unmarked as scheduled, so future compactions or DB close could hang.
+ const int kNumL0Files = 5;
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = kNumL0Files - 1;
+ options.max_background_compactions = 2;
+ DestroyAndReopen(options);
+
+ rocksdb::SyncPoint::GetInstance()->LoadDependency(
+ {{"LevelCompactionPicker::PickCompaction:Return",
+ "DBCompactionTest::CompactFilesPendingL0Bug:Picked"},
+ {"DBCompactionTest::CompactFilesPendingL0Bug:ManualCompacted",
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun"}});
+ rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+ auto schedule_multi_compaction_token =
+ dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+ // Files 0-3 will be included in an L0->L1 compaction.
+ //
+ // File 4 will be included in a call to CompactFiles() while the first
+ // compaction is running.
+ for (int i = 0; i < kNumL0Files - 1; ++i) {
+ ASSERT_OK(Put(Key(0), "val")); // sentinel to prevent trivial move
+ ASSERT_OK(Put(Key(i + 1), "val"));
+ ASSERT_OK(Flush());
+ }
+ TEST_SYNC_POINT("DBCompactionTest::CompactFilesPendingL0Bug:Picked");
+ // file 4 flushed after 0-3 picked
+ ASSERT_OK(Put(Key(kNumL0Files), "val"));
+ ASSERT_OK(Flush());
+
+ // previously DB close would hang forever as this situation caused scheduled
+ // compactions count to never decrement to zero.
+ ColumnFamilyMetaData cf_meta;
+ dbfull()->GetColumnFamilyMetaData(dbfull()->DefaultColumnFamily(), &cf_meta);
+ ASSERT_EQ(kNumL0Files, cf_meta.levels[0].files.size());
+ std::vector<std::string> input_filenames;
+ input_filenames.push_back(cf_meta.levels[0].files.front().name);
+ ASSERT_OK(dbfull()
+ ->CompactFiles(CompactionOptions(), input_filenames,
+ 0 /* output_level */));
+ TEST_SYNC_POINT("DBCompactionTest::CompactFilesPendingL0Bug:ManualCompacted");
+ rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, CompactFilesOverlapInL0Bug) {
+ // Regression test for bug of not pulling in L0 files that overlap the user-
+ // specified input files in time- and key-ranges.
+ Put(Key(0), "old_val");
+ Flush();
+ Put(Key(0), "new_val");
+ Flush();
+
+ ColumnFamilyMetaData cf_meta;
+ dbfull()->GetColumnFamilyMetaData(dbfull()->DefaultColumnFamily(), &cf_meta);
+ ASSERT_GE(cf_meta.levels.size(), 2);
+ ASSERT_EQ(2, cf_meta.levels[0].files.size());
+
+ // Compacting {new L0 file, L1 file} should pull in the old L0 file since it
+ // overlaps in key-range and time-range.
+ std::vector<std::string> input_filenames;
+ input_filenames.push_back(cf_meta.levels[0].files.front().name);
+ ASSERT_OK(dbfull()->CompactFiles(CompactionOptions(), input_filenames,
+ 1 /* output_level */));
+ ASSERT_EQ("new_val", Get(Key(0)));
+}
+
+TEST_F(DBCompactionTest, CompactBottomLevelFilesWithDeletions) {
+ // bottom-level files may contain deletions due to snapshots protecting the
+ // deleted keys. Once the snapshot is released, we should see files with many
+ // such deletions undergo single-file compactions.
+ const int kNumKeysPerFile = 1024;
+ const int kNumLevelFiles = 4;
+ const int kValueSize = 128;
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.level0_file_num_compaction_trigger = kNumLevelFiles;
+ // inflate it a bit to account for key/metadata overhead
+ options.target_file_size_base = 120 * kNumKeysPerFile * kValueSize / 100;
+ CreateAndReopenWithCF({"one"}, options);
+
+ Random rnd(301);
+ const Snapshot* snapshot = nullptr;
+ for (int i = 0; i < kNumLevelFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(
+ Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize)));
+ }
+ if (i == kNumLevelFiles - 1) {
+ snapshot = db_->GetSnapshot();
+ // delete every other key after grabbing a snapshot, so these deletions
+ // and the keys they cover can't be dropped until after the snapshot is
+ // released.
+ for (int j = 0; j < kNumLevelFiles * kNumKeysPerFile; j += 2) {
+ ASSERT_OK(Delete(Key(j)));
+ }
+ }
+ Flush();
+ if (i < kNumLevelFiles - 1) {
+ ASSERT_EQ(i + 1, NumTableFilesAtLevel(0));
+ }
+ }
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(kNumLevelFiles, NumTableFilesAtLevel(1));
+
+ std::vector<LiveFileMetaData> pre_release_metadata, post_release_metadata;
+ db_->GetLiveFilesMetaData(&pre_release_metadata);
+ // just need to bump seqnum so ReleaseSnapshot knows the newest key in the SST
+ // files does not need to be preserved in case of a future snapshot.
+ ASSERT_OK(Put(Key(0), "val"));
+ ASSERT_NE(kMaxSequenceNumber, dbfull()->bottommost_files_mark_threshold_);
+ // release snapshot and wait for compactions to finish. Single-file
+ // compactions should be triggered, which reduce the size of each bottom-level
+ // file without changing file count.
+ db_->ReleaseSnapshot(snapshot);
+ ASSERT_EQ(kMaxSequenceNumber, dbfull()->bottommost_files_mark_threshold_);
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ ASSERT_TRUE(compaction->compaction_reason() ==
+ CompactionReason::kBottommostFiles);
+ });
+ rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+ dbfull()->TEST_WaitForCompact();
+ db_->GetLiveFilesMetaData(&post_release_metadata);
+ ASSERT_EQ(pre_release_metadata.size(), post_release_metadata.size());
+
+ for (size_t i = 0; i < pre_release_metadata.size(); ++i) {
+ const auto& pre_file = pre_release_metadata[i];
+ const auto& post_file = post_release_metadata[i];
+ ASSERT_EQ(1, pre_file.level);
+ ASSERT_EQ(1, post_file.level);
+ // each file is smaller than it was before as it was rewritten without
+ // deletion markers/deleted keys.
+ ASSERT_LT(post_file.size, pre_file.size);
+ }
+ rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, LevelCompactExpiredTtlFiles) {
+ const int kNumKeysPerFile = 32;
+ const int kNumLevelFiles = 2;
+ const int kValueSize = 1024;
+
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.ttl = 24 * 60 * 60; // 24 hours
+ options.max_open_files = -1;
+ env_->time_elapse_only_sleep_ = false;
+ options.env = env_;
+
+ env_->addon_time_.store(0);
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < kNumLevelFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(
+ Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize)));
+ }
+ Flush();
+ }
+ dbfull()->TEST_WaitForCompact();
+ MoveFilesToLevel(3);
+ ASSERT_EQ("0,0,0,2", FilesPerLevel());
+
+ // Delete previously written keys.
+ for (int i = 0; i < kNumLevelFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(Delete(Key(i * kNumKeysPerFile + j)));
+ }
+ Flush();
+ }
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ("2,0,0,2", FilesPerLevel());
+ MoveFilesToLevel(1);
+ ASSERT_EQ("0,2,0,2", FilesPerLevel());
+
+ env_->addon_time_.fetch_add(36 * 60 * 60); // 36 hours
+ ASSERT_EQ("0,2,0,2", FilesPerLevel());
+
+ // Just do a simple write + flush so that the Ttl expired files get
+ // compacted.
+ ASSERT_OK(Put("a", "1"));
+ Flush();
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ ASSERT_TRUE(compaction->compaction_reason() == CompactionReason::kTtl);
+ });
+ rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+ dbfull()->TEST_WaitForCompact();
+ // All non-L0 files are deleted, as they contained only deleted data.
+ ASSERT_EQ("1", FilesPerLevel());
+ rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+
+ // Test dynamically changing ttl.
+
+ env_->addon_time_.store(0);
+ DestroyAndReopen(options);
+
+ for (int i = 0; i < kNumLevelFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(
+ Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize)));
+ }
+ Flush();
+ }
+ dbfull()->TEST_WaitForCompact();
+ MoveFilesToLevel(3);
+ ASSERT_EQ("0,0,0,2", FilesPerLevel());
+
+ // Delete previously written keys.
+ for (int i = 0; i < kNumLevelFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(Delete(Key(i * kNumKeysPerFile + j)));
+ }
+ Flush();
+ }
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ("2,0,0,2", FilesPerLevel());
+ MoveFilesToLevel(1);
+ ASSERT_EQ("0,2,0,2", FilesPerLevel());
+
+ // Move time forward by 12 hours, and make sure that compaction still doesn't
+ // trigger as ttl is set to 24 hours.
+ env_->addon_time_.fetch_add(12 * 60 * 60);
+ ASSERT_OK(Put("a", "1"));
+ Flush();
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ("1,2,0,2", FilesPerLevel());
+
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ ASSERT_TRUE(compaction->compaction_reason() == CompactionReason::kTtl);
+ });
+ rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Dynamically change ttl to 10 hours.
+ // This should trigger a ttl compaction, as 12 hours have already passed.
+ ASSERT_OK(dbfull()->SetOptions({{"ttl", "36000"}}));
+ dbfull()->TEST_WaitForCompact();
+ // All non-L0 files are deleted, as they contained only deleted data.
+ ASSERT_EQ("1", FilesPerLevel());
+ rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, CompactRangeDelayedByL0FileCount) {
+ // Verify that, when `CompactRangeOptions::allow_write_stall == false`, manual
+ // compaction only triggers flush after it's sure stall won't be triggered for
+ // L0 file count going too high.
+ const int kNumL0FilesTrigger = 4;
+ const int kNumL0FilesLimit = 8;
+ // i == 0: verifies normal case where stall is avoided by delay
+ // i == 1: verifies no delay in edge case where stall trigger is same as
+ // compaction trigger, so stall can't be avoided
+ for (int i = 0; i < 2; ++i) {
+ Options options = CurrentOptions();
+ options.level0_slowdown_writes_trigger = kNumL0FilesLimit;
+ if (i == 0) {
+ options.level0_file_num_compaction_trigger = kNumL0FilesTrigger;
+ } else {
+ options.level0_file_num_compaction_trigger = kNumL0FilesLimit;
+ }
+ Reopen(options);
+
+ if (i == 0) {
+ // ensure the auto compaction doesn't finish until manual compaction has
+ // had a chance to be delayed.
+ rocksdb::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait",
+ "CompactionJob::Run():End"}});
+ } else {
+ // ensure the auto-compaction doesn't finish until manual compaction has
+ // continued without delay.
+ rocksdb::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::FlushMemTable:StallWaitDone", "CompactionJob::Run():End"}});
+ }
+ rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ for (int j = 0; j < kNumL0FilesLimit - 1; ++j) {
+ for (int k = 0; k < 2; ++k) {
+ ASSERT_OK(Put(Key(k), RandomString(&rnd, 1024)));
+ }
+ Flush();
+ }
+ auto manual_compaction_thread = port::Thread([this]() {
+ CompactRangeOptions cro;
+ cro.allow_write_stall = false;
+ db_->CompactRange(cro, nullptr, nullptr);
+ });
+
+ manual_compaction_thread.join();
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_GT(NumTableFilesAtLevel(1), 0);
+ rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+ }
+}
+
+TEST_F(DBCompactionTest, CompactRangeDelayedByImmMemTableCount) {
+ // Verify that, when `CompactRangeOptions::allow_write_stall == false`, manual
+ // compaction only triggers flush after it's sure stall won't be triggered for
+ // immutable memtable count going too high.
+ const int kNumImmMemTableLimit = 8;
+ // i == 0: verifies normal case where stall is avoided by delay
+ // i == 1: verifies no delay in edge case where stall trigger is same as flush
+ // trigger, so stall can't be avoided
+ for (int i = 0; i < 2; ++i) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ // the delay limit is one less than the stop limit. This test focuses on
+ // avoiding delay limit, but this option sets stop limit, so add one.
+ options.max_write_buffer_number = kNumImmMemTableLimit + 1;
+ if (i == 1) {
+ options.min_write_buffer_number_to_merge = kNumImmMemTableLimit;
+ }
+ Reopen(options);
+
+ if (i == 0) {
+ // ensure the flush doesn't finish until manual compaction has had a
+ // chance to be delayed.
+ rocksdb::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait",
+ "FlushJob::WriteLevel0Table"}});
+ } else {
+ // ensure the flush doesn't finish until manual compaction has continued
+ // without delay.
+ rocksdb::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::FlushMemTable:StallWaitDone",
+ "FlushJob::WriteLevel0Table"}});
+ }
+ rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ for (int j = 0; j < kNumImmMemTableLimit - 1; ++j) {
+ ASSERT_OK(Put(Key(0), RandomString(&rnd, 1024)));
+ FlushOptions flush_opts;
+ flush_opts.wait = false;
+ flush_opts.allow_write_stall = true;
+ dbfull()->Flush(flush_opts);
+ }
+
+ auto manual_compaction_thread = port::Thread([this]() {
+ CompactRangeOptions cro;
+ cro.allow_write_stall = false;
+ db_->CompactRange(cro, nullptr, nullptr);
+ });
+
+ manual_compaction_thread.join();
+ dbfull()->TEST_WaitForFlushMemTable();
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_GT(NumTableFilesAtLevel(1), 0);
+ rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+ }
+}
+
+TEST_F(DBCompactionTest, CompactRangeShutdownWhileDelayed) {
+ // Verify that, when `CompactRangeOptions::allow_write_stall == false`, delay
+ // does not hang if CF is dropped or DB is closed
+ const int kNumL0FilesTrigger = 4;
+ const int kNumL0FilesLimit = 8;
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = kNumL0FilesTrigger;
+ options.level0_slowdown_writes_trigger = kNumL0FilesLimit;
+ // i == 0: DB::DropColumnFamily() on CompactRange's target CF unblocks it
+ // i == 1: DB::CancelAllBackgroundWork() unblocks CompactRange. This is to
+ // simulate what happens during Close as we can't call Close (it
+ // blocks on the auto-compaction, making a cycle).
+ for (int i = 0; i < 2; ++i) {
+ CreateAndReopenWithCF({"one"}, options);
+ // The calls to close CF/DB wait until the manual compaction stalls.
+ // The auto-compaction waits until the manual compaction finishes to ensure
+ // the signal comes from closing CF/DB, not from compaction making progress.
+ rocksdb::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait",
+ "DBCompactionTest::CompactRangeShutdownWhileDelayed:PreShutdown"},
+ {"DBCompactionTest::CompactRangeShutdownWhileDelayed:PostManual",
+ "CompactionJob::Run():End"}});
+ rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ for (int j = 0; j < kNumL0FilesLimit - 1; ++j) {
+ for (int k = 0; k < 2; ++k) {
+ ASSERT_OK(Put(1, Key(k), RandomString(&rnd, 1024)));
+ }
+ Flush(1);
+ }
+ auto manual_compaction_thread = port::Thread([this]() {
+ CompactRangeOptions cro;
+ cro.allow_write_stall = false;
+ ASSERT_TRUE(db_->CompactRange(cro, handles_[1], nullptr, nullptr)
+ .IsShutdownInProgress());
+ });
+
+ TEST_SYNC_POINT(
+ "DBCompactionTest::CompactRangeShutdownWhileDelayed:PreShutdown");
+ if (i == 0) {
+ ASSERT_OK(db_->DropColumnFamily(handles_[1]));
+ } else {
+ dbfull()->CancelAllBackgroundWork(false /* wait */);
+ }
+ manual_compaction_thread.join();
+ TEST_SYNC_POINT(
+ "DBCompactionTest::CompactRangeShutdownWhileDelayed:PostManual");
+ dbfull()->TEST_WaitForCompact();
+ rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+ }
+}
+
+TEST_F(DBCompactionTest, CompactRangeSkipFlushAfterDelay) {
+ // Verify that, when `CompactRangeOptions::allow_write_stall == false`,
+ // CompactRange skips its flush if the delay is long enough that the memtables
+ // existing at the beginning of the call have already been flushed.
+ const int kNumL0FilesTrigger = 4;
+ const int kNumL0FilesLimit = 8;
+ Options options = CurrentOptions();
+ options.level0_slowdown_writes_trigger = kNumL0FilesLimit;
+ options.level0_file_num_compaction_trigger = kNumL0FilesTrigger;
+ Reopen(options);
+
+ Random rnd(301);
+ // The manual flush includes the memtable that was active when CompactRange
+ // began. So it unblocks CompactRange and precludes its flush. Throughout the
+ // test, stall conditions are upheld via high L0 file count.
+ rocksdb::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait",
+ "DBCompactionTest::CompactRangeSkipFlushAfterDelay:PreFlush"},
+ {"DBCompactionTest::CompactRangeSkipFlushAfterDelay:PostFlush",
+ "DBImpl::FlushMemTable:StallWaitDone"},
+ {"DBImpl::FlushMemTable:StallWaitDone", "CompactionJob::Run():End"}});
+ rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+ //used for the delayable flushes
+ FlushOptions flush_opts;
+ flush_opts.allow_write_stall = true;
+ for (int i = 0; i < kNumL0FilesLimit - 1; ++i) {
+ for (int j = 0; j < 2; ++j) {
+ ASSERT_OK(Put(Key(j), RandomString(&rnd, 1024)));
+ }
+ dbfull()->Flush(flush_opts);
+ }
+ auto manual_compaction_thread = port::Thread([this]() {
+ CompactRangeOptions cro;
+ cro.allow_write_stall = false;
+ db_->CompactRange(cro, nullptr, nullptr);
+ });
+
+ TEST_SYNC_POINT("DBCompactionTest::CompactRangeSkipFlushAfterDelay:PreFlush");
+ Put(ToString(0), RandomString(&rnd, 1024));
+ dbfull()->Flush(flush_opts);
+ Put(ToString(0), RandomString(&rnd, 1024));
+ TEST_SYNC_POINT("DBCompactionTest::CompactRangeSkipFlushAfterDelay:PostFlush");
+ manual_compaction_thread.join();
+
+ // If CompactRange's flush was skipped, the final Put above will still be
+ // in the active memtable.
+ std::string num_keys_in_memtable;
+ db_->GetProperty(DB::Properties::kNumEntriesActiveMemTable, &num_keys_in_memtable);
+ ASSERT_EQ(ToString(1), num_keys_in_memtable);
+
+ rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, CompactRangeFlushOverlappingMemtable) {
+ // Verify memtable only gets flushed if it contains data overlapping the range
+ // provided to `CompactRange`. Tests all kinds of overlap/non-overlap.
+ const int kNumEndpointKeys = 5;
+ std::string keys[kNumEndpointKeys] = {"a", "b", "c", "d", "e"};
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ Reopen(options);
+
+ // One extra iteration for nullptr, which means left side of interval is
+ // unbounded.
+ for (int i = 0; i <= kNumEndpointKeys; ++i) {
+ Slice begin;
+ Slice* begin_ptr;
+ if (i == 0) {
+ begin_ptr = nullptr;
+ } else {
+ begin = keys[i - 1];
+ begin_ptr = &begin;
+ }
+ // Start at `i` so right endpoint comes after left endpoint. One extra
+ // iteration for nullptr, which means right side of interval is unbounded.
+ for (int j = std::max(0, i - 1); j <= kNumEndpointKeys; ++j) {
+ Slice end;
+ Slice* end_ptr;
+ if (j == kNumEndpointKeys) {
+ end_ptr = nullptr;
+ } else {
+ end = keys[j];
+ end_ptr = &end;
+ }
+ ASSERT_OK(Put("b", "val"));
+ ASSERT_OK(Put("d", "val"));
+ CompactRangeOptions compact_range_opts;
+ ASSERT_OK(db_->CompactRange(compact_range_opts, begin_ptr, end_ptr));
+
+ uint64_t get_prop_tmp, num_memtable_entries = 0;
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesImmMemTables,
+ &get_prop_tmp));
+ num_memtable_entries += get_prop_tmp;
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable,
+ &get_prop_tmp));
+ num_memtable_entries += get_prop_tmp;
+ if (begin_ptr == nullptr || end_ptr == nullptr ||
+ (i <= 4 && j >= 1 && (begin != "c" || end != "c"))) {
+ // In this case `CompactRange`'s range overlapped in some way with the
+ // memtable's range, so flush should've happened. Then "b" and "d" won't
+ // be in the memtable.
+ ASSERT_EQ(0, num_memtable_entries);
+ } else {
+ ASSERT_EQ(2, num_memtable_entries);
+ // flush anyways to prepare for next iteration
+ db_->Flush(FlushOptions());
+ }
+ }
+ }
+}
+
+TEST_F(DBCompactionTest, CompactionStatsTest) {
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 2;
+ CompactionStatsCollector* collector = new CompactionStatsCollector();
+ options.listeners.emplace_back(collector);
+ DestroyAndReopen(options);
+
+ for (int i = 0; i < 32; i++) {
+ for (int j = 0; j < 5000; j++) {
+ Put(std::to_string(j), std::string(1, 'A'));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+ dbfull()->TEST_WaitForCompact();
+ ColumnFamilyHandleImpl* cfh =
+ static_cast<ColumnFamilyHandleImpl*>(dbfull()->DefaultColumnFamily());
+ ColumnFamilyData* cfd = cfh->cfd();
+
+ VerifyCompactionStats(*cfd, *collector);
+}
+
+TEST_F(DBCompactionTest, CompactFilesOutputRangeConflict) {
+ // LSM setup:
+ // L1: [ba bz]
+ // L2: [a b] [c d]
+ // L3: [a b] [c d]
+ //
+ // Thread 1: Thread 2:
+ // Begin compacting all L2->L3
+ // Compact [ba bz] L1->L3
+ // End compacting all L2->L3
+ //
+ // The compaction operation in thread 2 should be disallowed because the range
+ // overlaps with the compaction in thread 1, which also covers that range in
+ // L3.
+ Options options = CurrentOptions();
+ FlushedFileCollector* collector = new FlushedFileCollector();
+ options.listeners.emplace_back(collector);
+ Reopen(options);
+
+ for (int level = 3; level >= 2; --level) {
+ ASSERT_OK(Put("a", "val"));
+ ASSERT_OK(Put("b", "val"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("c", "val"));
+ ASSERT_OK(Put("d", "val"));
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(level);
+ }
+ ASSERT_OK(Put("ba", "val"));
+ ASSERT_OK(Put("bz", "val"));
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(1);
+
+ SyncPoint::GetInstance()->LoadDependency({
+ {"CompactFilesImpl:0",
+ "DBCompactionTest::CompactFilesOutputRangeConflict:Thread2Begin"},
+ {"DBCompactionTest::CompactFilesOutputRangeConflict:Thread2End",
+ "CompactFilesImpl:1"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ auto bg_thread = port::Thread([&]() {
+ // Thread 1
+ std::vector<std::string> filenames = collector->GetFlushedFiles();
+ filenames.pop_back();
+ ASSERT_OK(db_->CompactFiles(CompactionOptions(), filenames,
+ 3 /* output_level */));
+ });
+
+ // Thread 2
+ TEST_SYNC_POINT(
+ "DBCompactionTest::CompactFilesOutputRangeConflict:Thread2Begin");
+ std::string filename = collector->GetFlushedFiles().back();
+ ASSERT_FALSE(
+ db_->CompactFiles(CompactionOptions(), {filename}, 3 /* output_level */)
+ .ok());
+ TEST_SYNC_POINT(
+ "DBCompactionTest::CompactFilesOutputRangeConflict:Thread2End");
+
+ bg_thread.join();
+}
+
+TEST_F(DBCompactionTest, CompactionHasEmptyOutput) {
+ Options options = CurrentOptions();
+ SstStatsCollector* collector = new SstStatsCollector();
+ options.level0_file_num_compaction_trigger = 2;
+ options.listeners.emplace_back(collector);
+ Reopen(options);
+
+ // Make sure the L0 files overlap to prevent trivial move.
+ ASSERT_OK(Put("a", "val"));
+ ASSERT_OK(Put("b", "val"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Delete("a"));
+ ASSERT_OK(Delete("b"));
+ ASSERT_OK(Flush());
+
+ dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+
+ // Expect one file creation to start for each flush, and zero for compaction
+ // since no keys are written.
+ ASSERT_EQ(2, collector->num_ssts_creation_started());
+}
+
+TEST_F(DBCompactionTest, CompactionLimiter) {
+ const int kNumKeysPerFile = 10;
+ const int kMaxBackgroundThreads = 64;
+
+ struct CompactionLimiter {
+ std::string name;
+ int limit_tasks;
+ int max_tasks;
+ int tasks;
+ std::shared_ptr<ConcurrentTaskLimiter> limiter;
+ };
+
+ std::vector<CompactionLimiter> limiter_settings;
+ limiter_settings.push_back({"limiter_1", 1, 0, 0, nullptr});
+ limiter_settings.push_back({"limiter_2", 2, 0, 0, nullptr});
+ limiter_settings.push_back({"limiter_3", 3, 0, 0, nullptr});
+
+ for (auto& ls : limiter_settings) {
+ ls.limiter.reset(NewConcurrentTaskLimiter(ls.name, ls.limit_tasks));
+ }
+
+ std::shared_ptr<ConcurrentTaskLimiter> unique_limiter(
+ NewConcurrentTaskLimiter("unique_limiter", -1));
+
+ const char* cf_names[] = {"default", "0", "1", "2", "3", "4", "5",
+ "6", "7", "8", "9", "a", "b", "c", "d", "e", "f" };
+ const int cf_count = sizeof cf_names / sizeof cf_names[0];
+
+ std::unordered_map<std::string, CompactionLimiter*> cf_to_limiter;
+
+ Options options = CurrentOptions();
+ options.write_buffer_size = 110 * 1024; // 110KB
+ options.arena_block_size = 4096;
+ options.num_levels = 3;
+ options.level0_file_num_compaction_trigger = 4;
+ options.level0_slowdown_writes_trigger = 64;
+ options.level0_stop_writes_trigger = 64;
+ options.max_background_jobs = kMaxBackgroundThreads; // Enough threads
+ options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile));
+ options.max_write_buffer_number = 10; // Enough memtables
+ DestroyAndReopen(options);
+
+ std::vector<Options> option_vector;
+ option_vector.reserve(cf_count);
+
+ for (int cf = 0; cf < cf_count; cf++) {
+ ColumnFamilyOptions cf_opt(options);
+ if (cf == 0) {
+ // "Default" CF does't use compaction limiter
+ cf_opt.compaction_thread_limiter = nullptr;
+ } else if (cf == 1) {
+ // "1" CF uses bypass compaction limiter
+ unique_limiter->SetMaxOutstandingTask(-1);
+ cf_opt.compaction_thread_limiter = unique_limiter;
+ } else {
+ // Assign limiter by mod
+ auto& ls = limiter_settings[cf % 3];
+ cf_opt.compaction_thread_limiter = ls.limiter;
+ cf_to_limiter[cf_names[cf]] = &ls;
+ }
+ option_vector.emplace_back(DBOptions(options), cf_opt);
+ }
+
+ for (int cf = 1; cf < cf_count; cf++) {
+ CreateColumnFamilies({cf_names[cf]}, option_vector[cf]);
+ }
+
+ ReopenWithColumnFamilies(std::vector<std::string>(cf_names,
+ cf_names + cf_count),
+ option_vector);
+
+ port::Mutex mutex;
+
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:BeforeCompaction", [&](void* arg) {
+ const auto& cf_name = static_cast<ColumnFamilyData*>(arg)->GetName();
+ auto iter = cf_to_limiter.find(cf_name);
+ if (iter != cf_to_limiter.end()) {
+ MutexLock l(&mutex);
+ ASSERT_GE(iter->second->limit_tasks, ++iter->second->tasks);
+ iter->second->max_tasks = std::max(iter->second->max_tasks,
+ iter->second->limit_tasks);
+ }
+ });
+
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:AfterCompaction", [&](void* arg) {
+ const auto& cf_name = static_cast<ColumnFamilyData*>(arg)->GetName();
+ auto iter = cf_to_limiter.find(cf_name);
+ if (iter != cf_to_limiter.end()) {
+ MutexLock l(&mutex);
+ ASSERT_GE(--iter->second->tasks, 0);
+ }
+ });
+
+ rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Block all compact threads in thread pool.
+ const size_t kTotalFlushTasks = kMaxBackgroundThreads / 4;
+ const size_t kTotalCompactTasks = kMaxBackgroundThreads - kTotalFlushTasks;
+ env_->SetBackgroundThreads((int)kTotalFlushTasks, Env::HIGH);
+ env_->SetBackgroundThreads((int)kTotalCompactTasks, Env::LOW);
+
+ test::SleepingBackgroundTask sleeping_compact_tasks[kTotalCompactTasks];
+
+ // Block all compaction threads in thread pool.
+ for (size_t i = 0; i < kTotalCompactTasks; i++) {
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_compact_tasks[i], Env::LOW);
+ sleeping_compact_tasks[i].WaitUntilSleeping();
+ }
+
+ int keyIndex = 0;
+
+ for (int n = 0; n < options.level0_file_num_compaction_trigger; n++) {
+ for (int cf = 0; cf < cf_count; cf++) {
+ for (int i = 0; i < kNumKeysPerFile; i++) {
+ ASSERT_OK(Put(cf, Key(keyIndex++), ""));
+ }
+ // put extra key to trigger flush
+ ASSERT_OK(Put(cf, "", ""));
+ }
+
+ for (int cf = 0; cf < cf_count; cf++) {
+ dbfull()->TEST_WaitForFlushMemTable(handles_[cf]);
+ }
+ }
+
+ // Enough L0 files to trigger compaction
+ for (int cf = 0; cf < cf_count; cf++) {
+ ASSERT_EQ(NumTableFilesAtLevel(0, cf),
+ options.level0_file_num_compaction_trigger);
+ }
+
+ // Create more files for one column family, which triggers speed up
+ // condition, all compactions will be scheduled.
+ for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) {
+ for (int i = 0; i < kNumKeysPerFile; i++) {
+ ASSERT_OK(Put(0, Key(i), ""));
+ }
+ // put extra key to trigger flush
+ ASSERT_OK(Put(0, "", ""));
+ dbfull()->TEST_WaitForFlushMemTable(handles_[0]);
+ ASSERT_EQ(options.level0_file_num_compaction_trigger + num + 1,
+ NumTableFilesAtLevel(0, 0));
+ }
+
+ // All CFs are pending compaction
+ ASSERT_EQ(cf_count, env_->GetThreadPoolQueueLen(Env::LOW));
+
+ // Unblock all compaction threads
+ for (size_t i = 0; i < kTotalCompactTasks; i++) {
+ sleeping_compact_tasks[i].WakeUp();
+ sleeping_compact_tasks[i].WaitUntilDone();
+ }
+
+ for (int cf = 0; cf < cf_count; cf++) {
+ dbfull()->TEST_WaitForFlushMemTable(handles_[cf]);
+ }
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ // Max outstanding compact tasks reached limit
+ for (auto& ls : limiter_settings) {
+ ASSERT_EQ(ls.limit_tasks, ls.max_tasks);
+ ASSERT_EQ(0, ls.limiter->GetOutstandingTask());
+ }
+
+ // test manual compaction under a fully throttled limiter
+ int cf_test = 1;
+ unique_limiter->SetMaxOutstandingTask(0);
+
+ // flush one more file to cf 1
+ for (int i = 0; i < kNumKeysPerFile; i++) {
+ ASSERT_OK(Put(cf_test, Key(keyIndex++), ""));
+ }
+ // put extra key to trigger flush
+ ASSERT_OK(Put(cf_test, "", ""));
+
+ dbfull()->TEST_WaitForFlushMemTable(handles_[cf_test]);
+ ASSERT_EQ(1, NumTableFilesAtLevel(0, cf_test));
+
+ Compact(cf_test, Key(0), Key(keyIndex));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+}
+
+INSTANTIATE_TEST_CASE_P(DBCompactionTestWithParam, DBCompactionTestWithParam,
+ ::testing::Values(std::make_tuple(1, true),
+ std::make_tuple(1, false),
+ std::make_tuple(4, true),
+ std::make_tuple(4, false)));
+
+TEST_P(DBCompactionDirectIOTest, DirectIO) {
+ Options options = CurrentOptions();
+ Destroy(options);
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.use_direct_io_for_flush_and_compaction = GetParam();
+ options.env = new MockEnv(Env::Default());
+ Reopen(options);
+ bool readahead = false;
+ SyncPoint::GetInstance()->SetCallBack(
+ "TableCache::NewIterator:for_compaction", [&](void* arg) {
+ bool* use_direct_reads = static_cast<bool*>(arg);
+ ASSERT_EQ(*use_direct_reads,
+ options.use_direct_reads);
+ });
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::OpenCompactionOutputFile", [&](void* arg) {
+ bool* use_direct_writes = static_cast<bool*>(arg);
+ ASSERT_EQ(*use_direct_writes,
+ options.use_direct_io_for_flush_and_compaction);
+ });
+ if (options.use_direct_io_for_flush_and_compaction) {
+ SyncPoint::GetInstance()->SetCallBack(
+ "SanitizeOptions:direct_io", [&](void* /*arg*/) {
+ readahead = true;
+ });
+ }
+ SyncPoint::GetInstance()->EnableProcessing();
+ CreateAndReopenWithCF({"pikachu"}, options);
+ MakeTables(3, "p", "q", 1);
+ ASSERT_EQ("1,1,1", FilesPerLevel(1));
+ Compact(1, "p1", "p9");
+ ASSERT_EQ(readahead, options.use_direct_reads);
+ ASSERT_EQ("0,0,1", FilesPerLevel(1));
+ Destroy(options);
+ delete options.env;
+}
+
+INSTANTIATE_TEST_CASE_P(DBCompactionDirectIOTest, DBCompactionDirectIOTest,
+ testing::Bool());
+
+class CompactionPriTest : public DBTestBase,
+ public testing::WithParamInterface<uint32_t> {
+ public:
+ CompactionPriTest() : DBTestBase("/compaction_pri_test") {
+ compaction_pri_ = GetParam();
+ }
+
+ // Required if inheriting from testing::WithParamInterface<>
+ static void SetUpTestCase() {}
+ static void TearDownTestCase() {}
+
+ uint32_t compaction_pri_;
+};
+
+TEST_P(CompactionPriTest, Test) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 16 * 1024;
+ options.compaction_pri = static_cast<CompactionPri>(compaction_pri_);
+ options.hard_pending_compaction_bytes_limit = 256 * 1024;
+ options.max_bytes_for_level_base = 64 * 1024;
+ options.max_bytes_for_level_multiplier = 4;
+ options.compression = kNoCompression;
+
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ const int kNKeys = 5000;
+ int keys[kNKeys];
+ for (int i = 0; i < kNKeys; i++) {
+ keys[i] = i;
+ }
+ std::random_shuffle(std::begin(keys), std::end(keys));
+
+ for (int i = 0; i < kNKeys; i++) {
+ ASSERT_OK(Put(Key(keys[i]), RandomString(&rnd, 102)));
+ }
+
+ dbfull()->TEST_WaitForCompact();
+ for (int i = 0; i < kNKeys; i++) {
+ ASSERT_NE("NOT_FOUND", Get(Key(i)));
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(
+ CompactionPriTest, CompactionPriTest,
+ ::testing::Values(CompactionPri::kByCompensatedSize,
+ CompactionPri::kOldestLargestSeqFirst,
+ CompactionPri::kOldestSmallestSeqFirst,
+ CompactionPri::kMinOverlappingRatio));
+
+class NoopMergeOperator : public MergeOperator {
+ public:
+ NoopMergeOperator() {}
+
+ bool FullMergeV2(const MergeOperationInput& /*merge_in*/,
+ MergeOperationOutput* merge_out) const override {
+ std::string val("bar");
+ merge_out->new_value = val;
+ return true;
+ }
+
+ const char* Name() const override { return "Noop"; }
+};
+
+TEST_F(DBCompactionTest, PartialManualCompaction) {
+ Options opts = CurrentOptions();
+ opts.num_levels = 3;
+ opts.level0_file_num_compaction_trigger = 10;
+ opts.compression = kNoCompression;
+ opts.merge_operator.reset(new NoopMergeOperator());
+ opts.target_file_size_base = 10240;
+ DestroyAndReopen(opts);
+
+ Random rnd(301);
+ for (auto i = 0; i < 8; ++i) {
+ for (auto j = 0; j < 10; ++j) {
+ Merge("foo", RandomString(&rnd, 1024));
+ }
+ Flush();
+ }
+
+ MoveFilesToLevel(2);
+
+ std::string prop;
+ EXPECT_TRUE(dbfull()->GetProperty(DB::Properties::kLiveSstFilesSize, &prop));
+ uint64_t max_compaction_bytes = atoi(prop.c_str()) / 2;
+ ASSERT_OK(dbfull()->SetOptions(
+ {{"max_compaction_bytes", std::to_string(max_compaction_bytes)}}));
+
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ dbfull()->CompactRange(cro, nullptr, nullptr);
+}
+
+TEST_F(DBCompactionTest, ManualCompactionFailsInReadOnlyMode) {
+ // Regression test for bug where manual compaction hangs forever when the DB
+ // is in read-only mode. Verify it now at least returns, despite failing.
+ const int kNumL0Files = 4;
+ std::unique_ptr<FaultInjectionTestEnv> mock_env(
+ new FaultInjectionTestEnv(Env::Default()));
+ Options opts = CurrentOptions();
+ opts.disable_auto_compactions = true;
+ opts.env = mock_env.get();
+ DestroyAndReopen(opts);
+
+ Random rnd(301);
+ for (int i = 0; i < kNumL0Files; ++i) {
+ // Make sure files are overlapping in key-range to prevent trivial move.
+ Put("key1", RandomString(&rnd, 1024));
+ Put("key2", RandomString(&rnd, 1024));
+ Flush();
+ }
+ ASSERT_EQ(kNumL0Files, NumTableFilesAtLevel(0));
+
+ // Enter read-only mode by failing a write.
+ mock_env->SetFilesystemActive(false);
+ // Make sure this is outside `CompactRange`'s range so that it doesn't fail
+ // early trying to flush memtable.
+ ASSERT_NOK(Put("key3", RandomString(&rnd, 1024)));
+
+ // In the bug scenario, the first manual compaction would fail and forget to
+ // unregister itself, causing the second one to hang forever due to conflict
+ // with a non-running compaction.
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = false;
+ Slice begin_key("key1");
+ Slice end_key("key2");
+ ASSERT_NOK(dbfull()->CompactRange(cro, &begin_key, &end_key));
+ ASSERT_NOK(dbfull()->CompactRange(cro, &begin_key, &end_key));
+
+ // Close before mock_env destruct.
+ Close();
+}
+
+// FixFileIngestionCompactionDeadlock tests and verifies that compaction and
+// file ingestion do not cause deadlock in the event of write stall triggered
+// by number of L0 files reaching level0_stop_writes_trigger.
+TEST_P(DBCompactionTestWithParam, FixFileIngestionCompactionDeadlock) {
+ const int kNumKeysPerFile = 100;
+ // Generate SST files.
+ Options options = CurrentOptions();
+
+ // Generate an external SST file containing a single key, i.e. 99
+ std::string sst_files_dir = dbname_ + "/sst_files/";
+ test::DestroyDir(env_, sst_files_dir);
+ ASSERT_OK(env_->CreateDir(sst_files_dir));
+ SstFileWriter sst_writer(EnvOptions(), options);
+ const std::string sst_file_path = sst_files_dir + "test.sst";
+ ASSERT_OK(sst_writer.Open(sst_file_path));
+ ASSERT_OK(sst_writer.Put(Key(kNumKeysPerFile - 1), "value"));
+ ASSERT_OK(sst_writer.Finish());
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->LoadDependency({
+ {"DBImpl::IngestExternalFile:AfterIncIngestFileCounter",
+ "BackgroundCallCompaction:0"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.level0_file_num_compaction_trigger =
+ options.level0_stop_writes_trigger;
+ options.max_subcompactions = max_subcompactions_;
+ options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ // Generate level0_stop_writes_trigger L0 files to trigger write stop
+ for (int i = 0; i != options.level0_file_num_compaction_trigger; ++i) {
+ for (int j = 0; j != kNumKeysPerFile; ++j) {
+ ASSERT_OK(Put(Key(j), RandomString(&rnd, 990)));
+ }
+ if (0 == i) {
+ // When we reach here, the memtables have kNumKeysPerFile keys. Note that
+ // flush is not yet triggered. We need to write an extra key so that the
+ // write path will call PreprocessWrite and flush the previous key-value
+ // pairs to e flushed. After that, there will be the newest key in the
+ // memtable, and a bunch of L0 files. Since there is already one key in
+ // the memtable, then for i = 1, 2, ..., we do not have to write this
+ // extra key to trigger flush.
+ ASSERT_OK(Put("", ""));
+ }
+ dbfull()->TEST_WaitForFlushMemTable();
+ ASSERT_EQ(NumTableFilesAtLevel(0 /*level*/, 0 /*cf*/), i + 1);
+ }
+ // When we reach this point, there will be level0_stop_writes_trigger L0
+ // files and one extra key (99) in memory, which overlaps with the external
+ // SST file. Write stall triggers, and can be cleared only after compaction
+ // reduces the number of L0 files.
+
+ // Compaction will also be triggered since we have reached the threshold for
+ // auto compaction. Note that compaction may begin after the following file
+ // ingestion thread and waits for ingestion to finish.
+
+ // Thread to ingest file with overlapping key range with the current
+ // memtable. Consequently ingestion will trigger a flush. The flush MUST
+ // proceed without waiting for the write stall condition to clear, otherwise
+ // deadlock can happen.
+ port::Thread ingestion_thr([&]() {
+ IngestExternalFileOptions ifo;
+ Status s = db_->IngestExternalFile({sst_file_path}, ifo);
+ ASSERT_OK(s);
+ });
+
+ // More write to trigger write stop
+ ingestion_thr.join();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ Close();
+}
+
+#endif // !defined(ROCKSDB_LITE)
+} // namespace rocksdb
+
+int main(int argc, char** argv) {
+#if !defined(ROCKSDB_LITE)
+ rocksdb::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+#else
+ (void) argc;
+ (void) argv;
+ return 0;
+#endif
+}