From e6918187568dbd01842d8d1d2c808ce16a894239 Mon Sep 17 00:00:00 2001
From: Daniel Baumann
Date: Sun, 21 Apr 2024 13:54:28 +0200
Subject: Adding upstream version 18.2.2.

Signed-off-by: Daniel Baumann
---
 src/rocksdb/db/db_compaction_test.cc | 8227 ++++++++++++++++++++++++++++++++++
 1 file changed, 8227 insertions(+)
 create mode 100644 src/rocksdb/db/db_compaction_test.cc

diff --git a/src/rocksdb/db/db_compaction_test.cc b/src/rocksdb/db/db_compaction_test.cc
new file mode 100644
index 000000000..ba9c50b9a
--- /dev/null
+++ b/src/rocksdb/db/db_compaction_test.cc
@@ -0,0 +1,8227 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <tuple>
+
+#include "db/blob/blob_index.h"
+#include "db/db_test_util.h"
+#include "env/mock_env.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/concurrent_task_limiter.h"
+#include "rocksdb/experimental.h"
+#include "rocksdb/sst_file_writer.h"
+#include "rocksdb/utilities/convenience.h"
+#include "test_util/sync_point.h"
+#include "test_util/testutil.h"
+#include "util/concurrent_task_limiter_impl.h"
+#include "util/random.h"
+#include "utilities/fault_injection_env.h"
+#include "utilities/fault_injection_fs.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// SYNC_POINT is not supported in released Windows mode.
+#if !defined(ROCKSDB_LITE)
+
+class CompactionStatsCollector : public EventListener {
+ public:
+  CompactionStatsCollector()
+      : compaction_completed_(
+            static_cast<int>(CompactionReason::kNumOfReasons)) {
+    for (auto& v : compaction_completed_) {
+      v.store(0);
+    }
+  }
+
+  ~CompactionStatsCollector() override {}
+
+  void OnCompactionCompleted(DB* /* db */,
+                             const CompactionJobInfo& info) override {
+    int k = static_cast<int>(info.compaction_reason);
+    int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+    assert(k >= 0 && k < num_of_reasons);
+    compaction_completed_[k]++;
+  }
+
+  void OnExternalFileIngested(
+      DB* /* db */, const ExternalFileIngestionInfo& /* info */) override {
+    int k = static_cast<int>(CompactionReason::kExternalSstIngestion);
+    compaction_completed_[k]++;
+  }
+
+  void OnFlushCompleted(DB* /* db */, const FlushJobInfo& /* info */) override {
+    int k = static_cast<int>(CompactionReason::kFlush);
+    compaction_completed_[k]++;
+  }
+
+  int NumberOfCompactions(CompactionReason reason) const {
+    int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+    int k = static_cast<int>(reason);
+    assert(k >= 0 && k < num_of_reasons);
+    return compaction_completed_.at(k).load();
+  }
+
+ private:
+  std::vector<std::atomic<int>> compaction_completed_;
+};
+
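+// Illustrative usage sketch (comment only; it mirrors how listeners such as
+// FlushedFileCollector are wired up in the tests below, and the DBTestBase
+// helpers CurrentOptions()/Reopen() used throughout this file):
+//
+//   Options options = CurrentOptions();
+//   auto collector = std::make_shared<CompactionStatsCollector>();
+//   options.listeners.emplace_back(collector);
+//   Reopen(options);
+//   // ... write data, wait for compactions ...
+//   int n = collector->NumberOfCompactions(CompactionReason::kLevelL0FilesNum);
+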
+class DBCompactionTest : public DBTestBase {
+ public:
+  DBCompactionTest()
+      : DBTestBase("db_compaction_test", /*env_do_fsync=*/true) {}
+
+ protected:
+  /*
+   * Verifies compaction stats of cfd are valid.
+   *
+   * For each level of cfd, its compaction stats are valid if
+   * 1) sum(stat.counts) == stat.count, and
+   * 2) stat.counts[i] == collector.NumberOfCompactions(i)
+   */
+  void VerifyCompactionStats(ColumnFamilyData& cfd,
+                             const CompactionStatsCollector& collector) {
+#ifndef NDEBUG
+    InternalStats* internal_stats_ptr = cfd.internal_stats();
+    ASSERT_NE(internal_stats_ptr, nullptr);
+    const std::vector<InternalStats::CompactionStats>& comp_stats =
+        internal_stats_ptr->TEST_GetCompactionStats();
+    const int num_of_reasons =
+        static_cast<int>(CompactionReason::kNumOfReasons);
+    std::vector<int> counts(num_of_reasons, 0);
+    // Count the number of compactions caused by each CompactionReason across
+    // all levels.
+    for (const auto& stat : comp_stats) {
+      int sum = 0;
+      for (int i = 0; i < num_of_reasons; i++) {
+        counts[i] += stat.counts[i];
+        sum += stat.counts[i];
+      }
+      ASSERT_EQ(sum, stat.count);
+    }
+    // Verify InternalStats bookkeeping matches that of
+    // CompactionStatsCollector, assuming that all compactions complete.
+    for (int i = 0; i < num_of_reasons; i++) {
+      ASSERT_EQ(collector.NumberOfCompactions(static_cast<CompactionReason>(i)),
+                counts[i]);
+    }
+#endif /* NDEBUG */
+  }
+};
+
+class DBCompactionTestWithParam
+    : public DBTestBase,
+      public testing::WithParamInterface<std::tuple<uint32_t, bool>> {
+ public:
+  DBCompactionTestWithParam()
+      : DBTestBase("db_compaction_test", /*env_do_fsync=*/true) {
+    max_subcompactions_ = std::get<0>(GetParam());
+    exclusive_manual_compaction_ = std::get<1>(GetParam());
+  }
+
+  // Required if inheriting from testing::WithParamInterface<>
+  static void SetUpTestCase() {}
+  static void TearDownTestCase() {}
+
+  uint32_t max_subcompactions_;
+  bool exclusive_manual_compaction_;
+};
+
+class DBCompactionTestWithBottommostParam
+    : public DBTestBase,
+      public testing::WithParamInterface<BottommostLevelCompaction> {
+ public:
+  DBCompactionTestWithBottommostParam()
+      : DBTestBase("db_compaction_test", /*env_do_fsync=*/true) {
+    bottommost_level_compaction_ = GetParam();
+  }
+
+  BottommostLevelCompaction bottommost_level_compaction_;
+};
+
+class DBCompactionDirectIOTest : public DBCompactionTest,
+                                 public ::testing::WithParamInterface<bool> {
+ public:
+  DBCompactionDirectIOTest() : DBCompactionTest() {}
+};
+
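+// Illustrative sketch (comment only; "SomeScenario" is a placeholder name,
+// and the pattern simply mirrors the TEST_P bodies further down in this
+// file): a parameterized test reads the values captured by the fixture
+// constructor, e.g.
+//
+//   TEST_P(DBCompactionTestWithParam, SomeScenario) {
+//     Options options = CurrentOptions();
+//     options.max_subcompactions = max_subcompactions_;
+//     CompactRangeOptions cro;
+//     cro.exclusive_manual_compaction = exclusive_manual_compaction_;
+//     // ... exercise manual and automatic compactions ...
+//   }
+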
+// Param = true : target level is non-empty
+// Param = false: a level between the target level and the source level
+// is not empty.
+class ChangeLevelConflictsWithAuto
+    : public DBCompactionTest,
+      public ::testing::WithParamInterface<bool> {
+ public:
+  ChangeLevelConflictsWithAuto() : DBCompactionTest() {}
+};
+
+// Param = true: grab the compaction pressure token (enable
+// parallel compactions)
+// Param = false: do not grab the token (no parallel compactions)
+class RoundRobinSubcompactionsAgainstPressureToken
+    : public DBCompactionTest,
+      public ::testing::WithParamInterface<bool> {
+ public:
+  RoundRobinSubcompactionsAgainstPressureToken() {
+    grab_pressure_token_ = GetParam();
+  }
+  bool grab_pressure_token_;
+};
+
+class RoundRobinSubcompactionsAgainstResources
+    : public DBCompactionTest,
+      public ::testing::WithParamInterface<std::tuple<int, int>> {
+ public:
+  RoundRobinSubcompactionsAgainstResources() {
+    total_low_pri_threads_ = std::get<0>(GetParam());
+    max_compaction_limits_ = std::get<1>(GetParam());
+  }
+  int total_low_pri_threads_;
+  int max_compaction_limits_;
+};
+
+namespace {
+class FlushedFileCollector : public EventListener {
+ public:
+  FlushedFileCollector() {}
+  ~FlushedFileCollector() override {}
+
+  void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override {
+    std::lock_guard<std::mutex> lock(mutex_);
+    flushed_files_.push_back(info.file_path);
+  }
+
+  std::vector<std::string> GetFlushedFiles() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    std::vector<std::string> result;
+    for (auto fname : flushed_files_) {
+      result.push_back(fname);
+    }
+    return result;
+  }
+
+  void ClearFlushedFiles() { flushed_files_.clear(); }
+
+ private:
+  std::vector<std::string> flushed_files_;
+  std::mutex mutex_;
+};
+
+class SstStatsCollector : public EventListener {
+ public:
+  SstStatsCollector() : num_ssts_creation_started_(0) {}
+
+  void OnTableFileCreationStarted(
+      const TableFileCreationBriefInfo& /* info */) override {
+    ++num_ssts_creation_started_;
+  }
+
+  int num_ssts_creation_started() { return num_ssts_creation_started_; }
+
+ private:
+  std::atomic<int> num_ssts_creation_started_;
+};
+
+static const int kCDTValueSize = 1000;
+static const int kCDTKeysPerBuffer = 4;
+static const int kCDTNumLevels = 8;
+Options DeletionTriggerOptions(Options options) {
+  options.compression = kNoCompression;
+  options.write_buffer_size = kCDTKeysPerBuffer * (kCDTValueSize + 24);
+  options.min_write_buffer_number_to_merge = 1;
+  options.max_write_buffer_size_to_maintain = 0;
+  options.num_levels = kCDTNumLevels;
+  options.level0_file_num_compaction_trigger = 1;
+  options.target_file_size_base = options.write_buffer_size * 2;
+  options.target_file_size_multiplier = 2;
+  options.max_bytes_for_level_base =
+      options.target_file_size_base * options.target_file_size_multiplier;
+  options.max_bytes_for_level_multiplier = 2;
+  options.disable_auto_compactions = false;
+  options.compaction_options_universal.max_size_amplification_percent = 100;
+  return options;
+}
+
+bool HaveOverlappingKeyRanges(const Comparator* c, const SstFileMetaData& a,
+                              const SstFileMetaData& b) {
+  if (c->CompareWithoutTimestamp(a.smallestkey, b.smallestkey) >= 0) {
+    if (c->CompareWithoutTimestamp(a.smallestkey, b.largestkey) <= 0) {
+      // b.smallestkey <= a.smallestkey <= b.largestkey
+      return true;
+    }
+  } else if (c->CompareWithoutTimestamp(a.largestkey, b.smallestkey) >= 0) {
+    // a.smallestkey < b.smallestkey <= a.largestkey
+    return true;
+  }
+  if (c->CompareWithoutTimestamp(a.largestkey, b.largestkey) <= 0) {
+    if (c->CompareWithoutTimestamp(a.largestkey, b.smallestkey) >= 0) {
+      // b.smallestkey <= a.largestkey <= b.largestkey
+      return true;
+    }
+  } else if (c->CompareWithoutTimestamp(a.smallestkey, b.largestkey) <= 0) {
+    // a.smallestkey <= b.largestkey < a.largestkey
+    return true;
+  }
+  return false;
+}
+
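+// Worked example for HaveOverlappingKeyRanges() (illustrative comment): for
+// a = ["b", "f"] and b = ["e", "k"], the first branch is skipped because
+// a.smallestkey "b" < b.smallestkey "e", but the following else-if fires
+// since a.largestkey "f" >= b.smallestkey "e", so the ranges overlap. For
+// disjoint files a = ["a", "c"] and b = ["d", "f"], no branch returns true
+// and the function returns false.
+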
+// Identifies all files between level "min_level" and "max_level"
+// which have overlapping key ranges with "input_file_meta".
+void GetOverlappingFileNumbersForLevelCompaction(
+    const ColumnFamilyMetaData& cf_meta, const Comparator* comparator,
+    int min_level, int max_level, const SstFileMetaData* input_file_meta,
+    std::set<std::string>* overlapping_file_names) {
+  std::set<const SstFileMetaData*> overlapping_files;
+  overlapping_files.insert(input_file_meta);
+  for (int m = min_level; m <= max_level; ++m) {
+    for (auto& file : cf_meta.levels[m].files) {
+      for (auto* included_file : overlapping_files) {
+        if (HaveOverlappingKeyRanges(comparator, *included_file, file)) {
+          overlapping_files.insert(&file);
+          overlapping_file_names->insert(file.name);
+          break;
+        }
+      }
+    }
+  }
+}
+
+void VerifyCompactionResult(
+    const ColumnFamilyMetaData& cf_meta,
+    const std::set<std::string>& overlapping_file_numbers) {
+#ifndef NDEBUG
+  for (auto& level : cf_meta.levels) {
+    for (auto& file : level.files) {
+      assert(overlapping_file_numbers.find(file.name) ==
+             overlapping_file_numbers.end());
+    }
+  }
+#endif
+}
+
+const SstFileMetaData* PickFileRandomly(const ColumnFamilyMetaData& cf_meta,
+                                        Random* rand, int* level = nullptr) {
+  auto file_id = rand->Uniform(static_cast<int>(cf_meta.file_count)) + 1;
+  for (auto& level_meta : cf_meta.levels) {
+    if (file_id <= level_meta.files.size()) {
+      if (level != nullptr) {
+        *level = level_meta.level;
+      }
+      auto result = rand->Uniform(file_id);
+      return &(level_meta.files[result]);
+    }
+    file_id -= static_cast<uint32_t>(level_meta.files.size());
+  }
+  assert(false);
+  return nullptr;
+}
+}  // anonymous namespace
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+// All the TEST_P tests run once with sub_compactions disabled (i.e.
+// options.max_subcompactions = 1) and once with it enabled
+TEST_P(DBCompactionTestWithParam, CompactionDeletionTrigger) {
+  for (int tid = 0; tid < 3; ++tid) {
+    uint64_t db_size[2];
+    Options options = DeletionTriggerOptions(CurrentOptions());
+    options.max_subcompactions = max_subcompactions_;
+
+    if (tid == 1) {
+      // the following only disables stats update in DB::Open()
+      // and should not affect the result of this test.
+      options.skip_stats_update_on_db_open = true;
+    } else if (tid == 2) {
+      // third pass with universal compaction
+      options.compaction_style = kCompactionStyleUniversal;
+      options.num_levels = 1;
+    }
+
+    DestroyAndReopen(options);
+    Random rnd(301);
+
+    const int kTestSize = kCDTKeysPerBuffer * 1024;
+    std::vector<std::string> values;
+    for (int k = 0; k < kTestSize; ++k) {
+      values.push_back(rnd.RandomString(kCDTValueSize));
+      ASSERT_OK(Put(Key(k), values[k]));
+    }
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[0]));
+
+    for (int k = 0; k < kTestSize; ++k) {
+      ASSERT_OK(Delete(Key(k)));
+    }
+    ASSERT_OK(Flush());
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[1]));
+
+    if (options.compaction_style == kCompactionStyleUniversal) {
+      // Claim: in universal compaction none of the original data will remain
+      // once compactions settle.
+      //
+      // Proof: The compensated size of the file containing the most tombstones
+      // is enough on its own to trigger size amp compaction. Size amp
+      // compaction is a full compaction, so all tombstones meet the obsolete
+      // keys they cover.
+ ASSERT_EQ(0, db_size[1]); + } else { + // Claim: in level compaction at most `db_size[0] / 2` of the original + // data will remain once compactions settle. + // + // Proof: Assume the original data is all in the bottom level. If it were + // not, it would meet its tombstone sooner. The original data size is + // large enough to require fanout to bottom level to be greater than + // `max_bytes_for_level_multiplier == 2`. In the level just above, + // tombstones must cover less than `db_size[0] / 4` bytes since fanout >= + // 2 and file size is compensated by doubling the size of values we expect + // are covered (`kDeletionWeightOnCompaction == 2`). The tombstones in + // levels above must cover less than `db_size[0] / 8` bytes of original + // data, `db_size[0] / 16`, and so on. + ASSERT_GT(db_size[0] / 2, db_size[1]); + } + } +} +#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) + +TEST_F(DBCompactionTest, SkipStatsUpdateTest) { + // This test verify UpdateAccumulatedStats is not on + // if options.skip_stats_update_on_db_open = true + // The test will need to be updated if the internal behavior changes. + + Options options = DeletionTriggerOptions(CurrentOptions()); + options.disable_auto_compactions = true; + options.env = env_; + DestroyAndReopen(options); + Random rnd(301); + + const int kTestSize = kCDTKeysPerBuffer * 512; + std::vector values; + for (int k = 0; k < kTestSize; ++k) { + values.push_back(rnd.RandomString(kCDTValueSize)); + ASSERT_OK(Put(Key(k), values[k])); + } + + ASSERT_OK(Flush()); + + Close(); + + int update_acc_stats_called = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "VersionStorageInfo::UpdateAccumulatedStats", + [&](void* /* arg */) { ++update_acc_stats_called; }); + SyncPoint::GetInstance()->EnableProcessing(); + + // Reopen the DB with stats-update disabled + options.skip_stats_update_on_db_open = true; + options.max_open_files = 20; + Reopen(options); + + ASSERT_EQ(update_acc_stats_called, 0); + + // Repeat the reopen process, but this time we enable + // stats-update. + options.skip_stats_update_on_db_open = false; + Reopen(options); + + ASSERT_GT(update_acc_stats_called, 0); + + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBCompactionTest, TestTableReaderForCompaction) { + Options options = CurrentOptions(); + options.env = env_; + options.max_open_files = 20; + options.level0_file_num_compaction_trigger = 3; + // Avoid many shards with small max_open_files, where as little as + // two table insertions could lead to an LRU eviction, depending on + // hash values. + options.table_cache_numshardbits = 2; + DestroyAndReopen(options); + Random rnd(301); + + int num_table_cache_lookup = 0; + int num_new_table_reader = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "TableCache::FindTable:0", [&](void* arg) { + assert(arg != nullptr); + bool no_io = *(reinterpret_cast(arg)); + if (!no_io) { + // filter out cases for table properties queries. 
+ num_table_cache_lookup++; + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "TableCache::GetTableReader:0", + [&](void* /*arg*/) { num_new_table_reader++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + for (int k = 0; k < options.level0_file_num_compaction_trigger; ++k) { + ASSERT_OK(Put(Key(k), Key(k))); + ASSERT_OK(Put(Key(10 - k), "bar")); + if (k < options.level0_file_num_compaction_trigger - 1) { + num_table_cache_lookup = 0; + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + // preloading iterator issues one table cache lookup and create + // a new table reader, if not preloaded. + int old_num_table_cache_lookup = num_table_cache_lookup; + ASSERT_GE(num_table_cache_lookup, 1); + ASSERT_EQ(num_new_table_reader, 1); + + num_table_cache_lookup = 0; + num_new_table_reader = 0; + ASSERT_EQ(Key(k), Get(Key(k))); + // lookup iterator from table cache and no need to create a new one. + ASSERT_EQ(old_num_table_cache_lookup + num_table_cache_lookup, 2); + ASSERT_EQ(num_new_table_reader, 0); + } + } + + num_table_cache_lookup = 0; + num_new_table_reader = 0; + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + // Preloading iterator issues one table cache lookup and creates + // a new table reader. One file is created for flush and one for compaction. + // Compaction inputs make no table cache look-up for data/range deletion + // iterators + // May preload table cache too. + ASSERT_GE(num_table_cache_lookup, 2); + int old_num_table_cache_lookup2 = num_table_cache_lookup; + + // Create new iterator for: + // (1) 1 for verifying flush results + // (2) 1 for verifying compaction results. + // (3) New TableReaders will not be created for compaction inputs + ASSERT_EQ(num_new_table_reader, 2); + + num_table_cache_lookup = 0; + num_new_table_reader = 0; + ASSERT_EQ(Key(1), Get(Key(1))); + ASSERT_EQ(num_table_cache_lookup + old_num_table_cache_lookup2, 5); + ASSERT_EQ(num_new_table_reader, 0); + + num_table_cache_lookup = 0; + num_new_table_reader = 0; + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 2; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + // Only verifying compaction outputs issues one table cache lookup + // for both data block and range deletion block). + // May preload table cache too. + ASSERT_GE(num_table_cache_lookup, 1); + old_num_table_cache_lookup2 = num_table_cache_lookup; + // One for verifying compaction results. + // No new iterator created for compaction. + ASSERT_EQ(num_new_table_reader, 1); + + num_table_cache_lookup = 0; + num_new_table_reader = 0; + ASSERT_EQ(Key(1), Get(Key(1))); + ASSERT_EQ(num_table_cache_lookup + old_num_table_cache_lookup2, 3); + ASSERT_EQ(num_new_table_reader, 0); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_P(DBCompactionTestWithParam, CompactionDeletionTriggerReopen) { + for (int tid = 0; tid < 2; ++tid) { + uint64_t db_size[3]; + Options options = DeletionTriggerOptions(CurrentOptions()); + options.max_subcompactions = max_subcompactions_; + + if (tid == 1) { + // second pass with universal compaction + options.compaction_style = kCompactionStyleUniversal; + options.num_levels = 1; + } + + DestroyAndReopen(options); + Random rnd(301); + + // round 1 --- insert key/value pairs. 
+ const int kTestSize = kCDTKeysPerBuffer * 512; + std::vector values; + for (int k = 0; k < kTestSize; ++k) { + values.push_back(rnd.RandomString(kCDTValueSize)); + ASSERT_OK(Put(Key(k), values[k])); + } + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[0])); + Close(); + + // round 2 --- disable auto-compactions and issue deletions. + options.create_if_missing = false; + options.disable_auto_compactions = true; + Reopen(options); + + for (int k = 0; k < kTestSize; ++k) { + ASSERT_OK(Delete(Key(k))); + } + ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[1])); + Close(); + // as auto_compaction is off, we shouldn't see any reduction in db size. + ASSERT_LE(db_size[0], db_size[1]); + + // round 3 --- reopen db with auto_compaction on and see if + // deletion compensation still work. + options.disable_auto_compactions = false; + Reopen(options); + // insert relatively small amount of data to trigger auto compaction. + for (int k = 0; k < kTestSize / 10; ++k) { + ASSERT_OK(Put(Key(k), values[k])); + } + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[2])); + // this time we're expecting significant drop in size. + // + // See "CompactionDeletionTrigger" test for proof that at most + // `db_size[0] / 2` of the original data remains. In addition to that, this + // test inserts `db_size[0] / 10` to push the tombstones into SST files and + // then through automatic compactions. So in total `3 * db_size[0] / 5` of + // the original data may remain. + ASSERT_GT(3 * db_size[0] / 5, db_size[2]); + } +} + +TEST_F(DBCompactionTest, CompactRangeBottomPri) { + ASSERT_OK(Put(Key(50), "")); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Key(100), "")); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Key(200), "")); + ASSERT_OK(Flush()); + + { + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 2; + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + } + ASSERT_EQ("0,0,3", FilesPerLevel(0)); + + ASSERT_OK(Put(Key(1), "")); + ASSERT_OK(Put(Key(199), "")); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Key(2), "")); + ASSERT_OK(Put(Key(199), "")); + ASSERT_OK(Flush()); + ASSERT_EQ("2,0,3", FilesPerLevel(0)); + + // Now we have 2 L0 files, and 3 L2 files, and a manual compaction will + // be triggered. + // Two compaction jobs will run. One compacts 2 L0 files in Low Pri Pool + // and one compact to L2 in bottom pri pool. + int low_pri_count = 0; + int bottom_pri_count = 0; + SyncPoint::GetInstance()->SetCallBack( + "ThreadPoolImpl::Impl::BGThread:BeforeRun", [&](void* arg) { + Env::Priority* pri = reinterpret_cast(arg); + // First time is low pri pool in the test case. 
+ if (low_pri_count == 0 && bottom_pri_count == 0) { + ASSERT_EQ(Env::Priority::LOW, *pri); + } + if (*pri == Env::Priority::LOW) { + low_pri_count++; + } else { + bottom_pri_count++; + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + env_->SetBackgroundThreads(1, Env::Priority::BOTTOM); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ(1, low_pri_count); + ASSERT_EQ(1, bottom_pri_count); + ASSERT_EQ("0,0,2", FilesPerLevel(0)); + + // Recompact bottom most level uses bottom pool + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ(1, low_pri_count); + ASSERT_EQ(2, bottom_pri_count); + + env_->SetBackgroundThreads(0, Env::Priority::BOTTOM); + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + // Low pri pool is used if bottom pool has size 0. + ASSERT_EQ(2, low_pri_count); + ASSERT_EQ(2, bottom_pri_count); + + SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBCompactionTest, DisableStatsUpdateReopen) { + uint64_t db_size[3]; + for (int test = 0; test < 2; ++test) { + Options options = DeletionTriggerOptions(CurrentOptions()); + options.skip_stats_update_on_db_open = (test == 0); + + env_->random_read_counter_.Reset(); + DestroyAndReopen(options); + Random rnd(301); + + // round 1 --- insert key/value pairs. + const int kTestSize = kCDTKeysPerBuffer * 512; + std::vector values; + for (int k = 0; k < kTestSize; ++k) { + values.push_back(rnd.RandomString(kCDTValueSize)); + ASSERT_OK(Put(Key(k), values[k])); + } + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + // L1 and L2 can fit deletions iff size compensation does not take effect, + // i.e., when `skip_stats_update_on_db_open == true`. Move any remaining + // files at or above L2 down to L3 to ensure obsolete data does not + // accidentally meet its tombstone above L3. This makes the final size more + // deterministic and easy to see whether size compensation for deletions + // took effect. + MoveFilesToLevel(3 /* level */); + ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[0])); + Close(); + + // round 2 --- disable auto-compactions and issue deletions. + options.create_if_missing = false; + options.disable_auto_compactions = true; + + env_->random_read_counter_.Reset(); + Reopen(options); + + for (int k = 0; k < kTestSize; ++k) { + ASSERT_OK(Delete(Key(k))); + } + ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[1])); + Close(); + // as auto_compaction is off, we shouldn't see any reduction in db size. + ASSERT_LE(db_size[0], db_size[1]); + + // round 3 --- reopen db with auto_compaction on and see if + // deletion compensation still work. + options.disable_auto_compactions = false; + Reopen(options); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[2])); + + if (options.skip_stats_update_on_db_open) { + // If update stats on DB::Open is disable, we don't expect + // deletion entries taking effect. + // + // The deletions are small enough to fit in L1 and L2, and obsolete keys + // were moved to L3+, so none of the original data should have been + // dropped. + ASSERT_LE(db_size[0], db_size[2]); + } else { + // Otherwise, we should see a significant drop in db size. + // + // See "CompactionDeletionTrigger" test for proof that at most + // `db_size[0] / 2` of the original data remains. 
+ ASSERT_GT(db_size[0] / 2, db_size[2]); + } + } +} + +TEST_P(DBCompactionTestWithParam, CompactionTrigger) { + const int kNumKeysPerFile = 100; + + Options options = CurrentOptions(); + options.write_buffer_size = 110 << 10; // 110KB + options.arena_block_size = 4 << 10; + options.num_levels = 3; + options.level0_file_num_compaction_trigger = 3; + options.max_subcompactions = max_subcompactions_; + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeysPerFile)); + CreateAndReopenWithCF({"pikachu"}, options); + + Random rnd(301); + + for (int num = 0; num < options.level0_file_num_compaction_trigger - 1; + num++) { + std::vector values; + // Write 100KB (100 values, each 1K) + for (int i = 0; i < kNumKeysPerFile; i++) { + values.push_back(rnd.RandomString(990)); + ASSERT_OK(Put(1, Key(i), values[i])); + } + // put extra key to trigger flush + ASSERT_OK(Put(1, "", "")); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); + ASSERT_EQ(NumTableFilesAtLevel(0, 1), num + 1); + } + + // generate one more file in level-0, and should trigger level-0 compaction + std::vector values; + for (int i = 0; i < kNumKeysPerFile; i++) { + values.push_back(rnd.RandomString(990)); + ASSERT_OK(Put(1, Key(i), values[i])); + } + // put extra key to trigger flush + ASSERT_OK(Put(1, "", "")); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); + ASSERT_EQ(NumTableFilesAtLevel(1, 1), 1); +} + +TEST_F(DBCompactionTest, BGCompactionsAllowed) { + // Create several column families. Make compaction triggers in all of them + // and see number of compactions scheduled to be less than allowed. + const int kNumKeysPerFile = 100; + + Options options = CurrentOptions(); + options.write_buffer_size = 110 << 10; // 110KB + options.arena_block_size = 4 << 10; + options.num_levels = 3; + // Should speed up compaction when there are 4 files. + options.level0_file_num_compaction_trigger = 2; + options.level0_slowdown_writes_trigger = 20; + options.soft_pending_compaction_bytes_limit = 1 << 30; // Infinitely large + options.max_background_compactions = 3; + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeysPerFile)); + + // Block all threads in thread pool. + const size_t kTotalTasks = 4; + env_->SetBackgroundThreads(4, Env::LOW); + test::SleepingBackgroundTask sleeping_tasks[kTotalTasks]; + for (size_t i = 0; i < kTotalTasks; i++) { + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, + &sleeping_tasks[i], Env::Priority::LOW); + sleeping_tasks[i].WaitUntilSleeping(); + } + + CreateAndReopenWithCF({"one", "two", "three"}, options); + + Random rnd(301); + for (int cf = 0; cf < 4; cf++) { + for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) { + for (int i = 0; i < kNumKeysPerFile; i++) { + ASSERT_OK(Put(cf, Key(i), "")); + } + // put extra key to trigger flush + ASSERT_OK(Put(cf, "", "")); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf])); + ASSERT_EQ(NumTableFilesAtLevel(0, cf), num + 1); + } + } + + // Now all column families qualify compaction but only one should be + // scheduled, because no column family hits speed up condition. + ASSERT_EQ(1u, env_->GetThreadPoolQueueLen(Env::Priority::LOW)); + + // Create two more files for one column family, which triggers speed up + // condition, three compactions will be scheduled. 
+ for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) { + for (int i = 0; i < kNumKeysPerFile; i++) { + ASSERT_OK(Put(2, Key(i), "")); + } + // put extra key to trigger flush + ASSERT_OK(Put(2, "", "")); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[2])); + ASSERT_EQ(options.level0_file_num_compaction_trigger + num + 1, + NumTableFilesAtLevel(0, 2)); + } + ASSERT_EQ(3U, env_->GetThreadPoolQueueLen(Env::Priority::LOW)); + + // Unblock all threads to unblock all compactions. + for (size_t i = 0; i < kTotalTasks; i++) { + sleeping_tasks[i].WakeUp(); + sleeping_tasks[i].WaitUntilDone(); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + // Verify number of compactions allowed will come back to 1. + + for (size_t i = 0; i < kTotalTasks; i++) { + sleeping_tasks[i].Reset(); + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, + &sleeping_tasks[i], Env::Priority::LOW); + sleeping_tasks[i].WaitUntilSleeping(); + } + for (int cf = 0; cf < 4; cf++) { + for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) { + for (int i = 0; i < kNumKeysPerFile; i++) { + ASSERT_OK(Put(cf, Key(i), "")); + } + // put extra key to trigger flush + ASSERT_OK(Put(cf, "", "")); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf])); + ASSERT_EQ(NumTableFilesAtLevel(0, cf), num + 1); + } + } + + // Now all column families qualify compaction but only one should be + // scheduled, because no column family hits speed up condition. + ASSERT_EQ(1U, env_->GetThreadPoolQueueLen(Env::Priority::LOW)); + + for (size_t i = 0; i < kTotalTasks; i++) { + sleeping_tasks[i].WakeUp(); + sleeping_tasks[i].WaitUntilDone(); + } +} + +TEST_P(DBCompactionTestWithParam, CompactionsGenerateMultipleFiles) { + Options options = CurrentOptions(); + options.write_buffer_size = 100000000; // Large write buffer + options.max_subcompactions = max_subcompactions_; + CreateAndReopenWithCF({"pikachu"}, options); + + Random rnd(301); + + // Write 8MB (80 values, each 100K) + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); + std::vector values; + for (int i = 0; i < 80; i++) { + values.push_back(rnd.RandomString(100000)); + ASSERT_OK(Put(1, Key(i), values[i])); + } + + // Reopening moves updates to level-0 + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1], + true /* disallow trivial move */)); + + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); + ASSERT_GT(NumTableFilesAtLevel(1, 1), 1); + for (int i = 0; i < 80; i++) { + ASSERT_EQ(Get(1, Key(i)), values[i]); + } +} + +TEST_F(DBCompactionTest, MinorCompactionsHappen) { + do { + Options options = CurrentOptions(); + options.write_buffer_size = 10000; + CreateAndReopenWithCF({"pikachu"}, options); + + const int N = 500; + + int starting_num_tables = TotalTableFiles(1); + for (int i = 0; i < N; i++) { + ASSERT_OK(Put(1, Key(i), Key(i) + std::string(1000, 'v'))); + } + int ending_num_tables = TotalTableFiles(1); + ASSERT_GT(ending_num_tables, starting_num_tables); + + for (int i = 0; i < N; i++) { + ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(1, Key(i))); + } + + ReopenWithColumnFamilies({"default", "pikachu"}, options); + + for (int i = 0; i < N; i++) { + ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(1, Key(i))); + } + } while (ChangeCompactOptions()); +} + +TEST_F(DBCompactionTest, UserKeyCrossFile1) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleLevel; + options.level0_file_num_compaction_trigger = 3; + + 
DestroyAndReopen(options); + + // create first file and flush to l0 + ASSERT_OK(Put("4", "A")); + ASSERT_OK(Put("3", "A")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + + ASSERT_OK(Put("2", "A")); + ASSERT_OK(Delete("3")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_EQ("NOT_FOUND", Get("3")); + + // move both files down to l1 + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("NOT_FOUND", Get("3")); + + for (int i = 0; i < 3; i++) { + ASSERT_OK(Put("2", "B")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_EQ("NOT_FOUND", Get("3")); +} + +TEST_F(DBCompactionTest, UserKeyCrossFile2) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleLevel; + options.level0_file_num_compaction_trigger = 3; + + DestroyAndReopen(options); + + // create first file and flush to l0 + ASSERT_OK(Put("4", "A")); + ASSERT_OK(Put("3", "A")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + + ASSERT_OK(Put("2", "A")); + ASSERT_OK(SingleDelete("3")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_EQ("NOT_FOUND", Get("3")); + + // move both files down to l1 + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("NOT_FOUND", Get("3")); + + for (int i = 0; i < 3; i++) { + ASSERT_OK(Put("2", "B")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_EQ("NOT_FOUND", Get("3")); +} + +TEST_F(DBCompactionTest, CompactionSstPartitioner) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleLevel; + options.level0_file_num_compaction_trigger = 3; + std::shared_ptr factory( + NewSstPartitionerFixedPrefixFactory(4)); + options.sst_partitioner_factory = factory; + + DestroyAndReopen(options); + + // create first file and flush to l0 + ASSERT_OK(Put("aaaa1", "A")); + ASSERT_OK(Put("bbbb1", "B")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + + ASSERT_OK(Put("aaaa1", "A2")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + + // move both files down to l1 + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + std::vector files; + dbfull()->GetLiveFilesMetaData(&files); + ASSERT_EQ(2, files.size()); + ASSERT_EQ("A2", Get("aaaa1")); + ASSERT_EQ("B", Get("bbbb1")); +} + +TEST_F(DBCompactionTest, CompactionSstPartitionerNonTrivial) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleLevel; + options.level0_file_num_compaction_trigger = 1; + std::shared_ptr factory( + NewSstPartitionerFixedPrefixFactory(4)); + options.sst_partitioner_factory = factory; + + DestroyAndReopen(options); + + // create first file and flush to l0 + ASSERT_OK(Put("aaaa1", "A")); + ASSERT_OK(Put("bbbb1", "B")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); + + std::vector files; + dbfull()->GetLiveFilesMetaData(&files); + ASSERT_EQ(2, files.size()); + ASSERT_EQ("A", Get("aaaa1")); + ASSERT_EQ("B", Get("bbbb1")); +} + +TEST_F(DBCompactionTest, ZeroSeqIdCompaction) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleLevel; + options.level0_file_num_compaction_trigger = 3; + + FlushedFileCollector* collector = 
new FlushedFileCollector(); + options.listeners.emplace_back(collector); + + // compaction options + CompactionOptions compact_opt; + compact_opt.compression = kNoCompression; + compact_opt.output_file_size_limit = 4096; + const size_t key_len = + static_cast(compact_opt.output_file_size_limit) / 5; + + DestroyAndReopen(options); + + std::vector snaps; + + // create first file and flush to l0 + for (auto& key : {"1", "2", "3", "3", "3", "3"}) { + ASSERT_OK(Put(key, std::string(key_len, 'A'))); + snaps.push_back(dbfull()->GetSnapshot()); + } + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + + // create second file and flush to l0 + for (auto& key : {"3", "4", "5", "6", "7", "8"}) { + ASSERT_OK(Put(key, std::string(key_len, 'A'))); + snaps.push_back(dbfull()->GetSnapshot()); + } + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + + // move both files down to l1 + ASSERT_OK( + dbfull()->CompactFiles(compact_opt, collector->GetFlushedFiles(), 1)); + + // release snap so that first instance of key(3) can have seqId=0 + for (auto snap : snaps) { + dbfull()->ReleaseSnapshot(snap); + } + + // create 3 files in l0 so to trigger compaction + for (int i = 0; i < options.level0_file_num_compaction_trigger; i++) { + ASSERT_OK(Put("2", std::string(1, 'A'))); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + } + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_OK(Put("", "")); +} + +TEST_F(DBCompactionTest, ManualCompactionUnknownOutputSize) { + // github issue #2249 + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleLevel; + options.level0_file_num_compaction_trigger = 3; + DestroyAndReopen(options); + + // create two files in l1 that we can compact + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < options.level0_file_num_compaction_trigger; j++) { + ASSERT_OK(Put(std::to_string(2 * i), std::string(1, 'A'))); + ASSERT_OK(Put(std::to_string(2 * i + 1), std::string(1, 'A'))); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + } + ASSERT_OK( + dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "2"}})); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0); + ASSERT_EQ(NumTableFilesAtLevel(1, 0), 2); + ASSERT_OK( + dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "3"}})); + + ColumnFamilyMetaData cf_meta; + dbfull()->GetColumnFamilyMetaData(dbfull()->DefaultColumnFamily(), &cf_meta); + ASSERT_EQ(2, cf_meta.levels[1].files.size()); + std::vector input_filenames; + for (const auto& sst_file : cf_meta.levels[1].files) { + input_filenames.push_back(sst_file.name); + } + + // note CompactionOptions::output_file_size_limit is unset. + CompactionOptions compact_opt; + compact_opt.compression = kNoCompression; + ASSERT_OK(dbfull()->CompactFiles(compact_opt, input_filenames, 1)); +} + +// Check that writes done during a memtable compaction are recovered +// if the database is shutdown during the memtable compaction. 
+TEST_F(DBCompactionTest, RecoverDuringMemtableCompaction) { + do { + Options options = CurrentOptions(); + options.env = env_; + CreateAndReopenWithCF({"pikachu"}, options); + + // Trigger a long memtable compaction and reopen the database during it + ASSERT_OK(Put(1, "foo", "v1")); // Goes to 1st log file + ASSERT_OK(Put(1, "big1", std::string(10000000, 'x'))); // Fills memtable + ASSERT_OK(Put(1, "big2", std::string(1000, 'y'))); // Triggers compaction + ASSERT_OK(Put(1, "bar", "v2")); // Goes to new log file + + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_EQ("v2", Get(1, "bar")); + ASSERT_EQ(std::string(10000000, 'x'), Get(1, "big1")); + ASSERT_EQ(std::string(1000, 'y'), Get(1, "big2")); + } while (ChangeOptions()); +} + +TEST_P(DBCompactionTestWithParam, TrivialMoveOneFile) { + int32_t trivial_move = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:TrivialMove", + [&](void* /*arg*/) { trivial_move++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.write_buffer_size = 100000000; + options.max_subcompactions = max_subcompactions_; + DestroyAndReopen(options); + + int32_t num_keys = 80; + int32_t value_size = 100 * 1024; // 100 KB + + Random rnd(301); + std::vector values; + for (int i = 0; i < num_keys; i++) { + values.push_back(rnd.RandomString(value_size)); + ASSERT_OK(Put(Key(i), values[i])); + } + + // Reopening moves updates to L0 + Reopen(options); + ASSERT_EQ(NumTableFilesAtLevel(0, 0), 1); // 1 file in L0 + ASSERT_EQ(NumTableFilesAtLevel(1, 0), 0); // 0 files in L1 + + std::vector metadata; + db_->GetLiveFilesMetaData(&metadata); + ASSERT_EQ(metadata.size(), 1U); + LiveFileMetaData level0_file = metadata[0]; // L0 file meta + + CompactRangeOptions cro; + cro.exclusive_manual_compaction = exclusive_manual_compaction_; + + // Compaction will initiate a trivial move from L0 to L1 + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + + // File moved From L0 to L1 + ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0); // 0 files in L0 + ASSERT_EQ(NumTableFilesAtLevel(1, 0), 1); // 1 file in L1 + + metadata.clear(); + db_->GetLiveFilesMetaData(&metadata); + ASSERT_EQ(metadata.size(), 1U); + ASSERT_EQ(metadata[0].name /* level1_file.name */, level0_file.name); + ASSERT_EQ(metadata[0].size /* level1_file.size */, level0_file.size); + + for (int i = 0; i < num_keys; i++) { + ASSERT_EQ(Get(Key(i)), values[i]); + } + + ASSERT_EQ(trivial_move, 1); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_P(DBCompactionTestWithParam, TrivialMoveNonOverlappingFiles) { + int32_t trivial_move = 0; + int32_t non_trivial_move = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:TrivialMove", + [&](void* /*arg*/) { trivial_move++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial", + [&](void* /*arg*/) { non_trivial_move++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.write_buffer_size = 10 * 1024 * 1024; + options.max_subcompactions = max_subcompactions_; + + DestroyAndReopen(options); + // non overlapping ranges + std::vector> ranges = { + {100, 199}, {300, 399}, {0, 99}, {200, 299}, + {600, 699}, {400, 499}, {500, 550}, {551, 599}, + }; + int32_t value_size = 10 * 1024; // 10 KB + + 
Random rnd(301); + std::map values; + for (size_t i = 0; i < ranges.size(); i++) { + for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) { + values[j] = rnd.RandomString(value_size); + ASSERT_OK(Put(Key(j), values[j])); + } + ASSERT_OK(Flush()); + } + + int32_t level0_files = NumTableFilesAtLevel(0, 0); + ASSERT_EQ(level0_files, ranges.size()); // Multiple files in L0 + ASSERT_EQ(NumTableFilesAtLevel(1, 0), 0); // No files in L1 + + CompactRangeOptions cro; + cro.exclusive_manual_compaction = exclusive_manual_compaction_; + + // Since data is non-overlapping we expect compaction to initiate + // a trivial move + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + // We expect that all the files were trivially moved from L0 to L1 + ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0); + ASSERT_EQ(NumTableFilesAtLevel(1, 0) /* level1_files */, level0_files); + + for (size_t i = 0; i < ranges.size(); i++) { + for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) { + ASSERT_EQ(Get(Key(j)), values[j]); + } + } + + ASSERT_EQ(trivial_move, 1); + ASSERT_EQ(non_trivial_move, 0); + + trivial_move = 0; + non_trivial_move = 0; + values.clear(); + DestroyAndReopen(options); + // Same ranges as above but overlapping + ranges = { + {100, 199}, + {300, 399}, + {0, 99}, + {200, 299}, + {600, 699}, + {400, 499}, + {500, 560}, // this range overlap with the next + // one + {551, 599}, + }; + for (size_t i = 0; i < ranges.size(); i++) { + for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) { + values[j] = rnd.RandomString(value_size); + ASSERT_OK(Put(Key(j), values[j])); + } + ASSERT_OK(Flush()); + } + + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + + for (size_t i = 0; i < ranges.size(); i++) { + for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) { + ASSERT_EQ(Get(Key(j)), values[j]); + } + } + ASSERT_EQ(trivial_move, 0); + ASSERT_EQ(non_trivial_move, 1); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_P(DBCompactionTestWithParam, TrivialMoveTargetLevel) { + int32_t trivial_move = 0; + int32_t non_trivial_move = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:TrivialMove", + [&](void* /*arg*/) { trivial_move++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial", + [&](void* /*arg*/) { non_trivial_move++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.write_buffer_size = 10 * 1024 * 1024; + options.num_levels = 7; + options.max_subcompactions = max_subcompactions_; + + DestroyAndReopen(options); + int32_t value_size = 10 * 1024; // 10 KB + + // Add 2 non-overlapping files + Random rnd(301); + std::map values; + + // file 1 [0 => 300] + for (int32_t i = 0; i <= 300; i++) { + values[i] = rnd.RandomString(value_size); + ASSERT_OK(Put(Key(i), values[i])); + } + ASSERT_OK(Flush()); + + // file 2 [600 => 700] + for (int32_t i = 600; i <= 700; i++) { + values[i] = rnd.RandomString(value_size); + ASSERT_OK(Put(Key(i), values[i])); + } + ASSERT_OK(Flush()); + + // 2 files in L0 + ASSERT_EQ("2", FilesPerLevel(0)); + CompactRangeOptions compact_options; + compact_options.change_level = true; + compact_options.target_level = 6; + compact_options.exclusive_manual_compaction = exclusive_manual_compaction_; + ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr)); + // 2 files in L6 + ASSERT_EQ("0,0,0,0,0,0,2", 
FilesPerLevel(0)); + + ASSERT_EQ(trivial_move, 1); + ASSERT_EQ(non_trivial_move, 0); + + for (int32_t i = 0; i <= 300; i++) { + ASSERT_EQ(Get(Key(i)), values[i]); + } + for (int32_t i = 600; i <= 700; i++) { + ASSERT_EQ(Get(Key(i)), values[i]); + } +} + +TEST_P(DBCompactionTestWithParam, PartialOverlappingL0) { + class SubCompactionEventListener : public EventListener { + public: + void OnSubcompactionCompleted(const SubcompactionJobInfo&) override { + sub_compaction_finished_++; + } + std::atomic sub_compaction_finished_{0}; + }; + + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.write_buffer_size = 10 * 1024 * 1024; + options.max_subcompactions = max_subcompactions_; + SubCompactionEventListener* listener = new SubCompactionEventListener(); + options.listeners.emplace_back(listener); + + DestroyAndReopen(options); + + // For subcompactino to trigger, output level needs to be non-empty. + ASSERT_OK(Put("key", "")); + ASSERT_OK(Put("kez", "")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("key", "")); + ASSERT_OK(Put("kez", "")); + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + // Ranges that are only briefly overlapping so that they won't be trivially + // moved but subcompaction ranges would only contain a subset of files. + std::vector> ranges = { + {100, 199}, {198, 399}, {397, 600}, {598, 800}, {799, 900}, {895, 999}, + }; + int32_t value_size = 10 * 1024; // 10 KB + + Random rnd(301); + std::map values; + for (size_t i = 0; i < ranges.size(); i++) { + for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) { + values[j] = rnd.RandomString(value_size); + ASSERT_OK(Put(Key(j), values[j])); + } + ASSERT_OK(Flush()); + } + + int32_t level0_files = NumTableFilesAtLevel(0, 0); + ASSERT_EQ(level0_files, ranges.size()); // Multiple files in L0 + ASSERT_EQ(NumTableFilesAtLevel(1, 0), 1); // One file in L1 + + listener->sub_compaction_finished_ = 0; + ASSERT_OK(db_->EnableAutoCompaction({db_->DefaultColumnFamily()})); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + if (max_subcompactions_ > 3) { + // RocksDB might not generate the exact number of sub compactions. + // Here we validate that at least subcompaction happened. 
+ ASSERT_GT(listener->sub_compaction_finished_.load(), 2); + } + + // We expect that all the files were compacted to L1 + ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0); + ASSERT_GT(NumTableFilesAtLevel(1, 0), 1); + + for (size_t i = 0; i < ranges.size(); i++) { + for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) { + ASSERT_EQ(Get(Key(j)), values[j]); + } + } +} + +TEST_P(DBCompactionTestWithParam, ManualCompactionPartial) { + int32_t trivial_move = 0; + int32_t non_trivial_move = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:TrivialMove", + [&](void* /*arg*/) { trivial_move++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial", + [&](void* /*arg*/) { non_trivial_move++; }); + bool first = true; + // Purpose of dependencies: + // 4 -> 1: ensure the order of two non-trivial compactions + // 5 -> 2 and 5 -> 3: ensure we do a check before two non-trivial compactions + // are installed + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBCompaction::ManualPartial:4", "DBCompaction::ManualPartial:1"}, + {"DBCompaction::ManualPartial:5", "DBCompaction::ManualPartial:2"}, + {"DBCompaction::ManualPartial:5", "DBCompaction::ManualPartial:3"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) { + if (first) { + first = false; + TEST_SYNC_POINT("DBCompaction::ManualPartial:4"); + TEST_SYNC_POINT("DBCompaction::ManualPartial:3"); + } else { // second non-trivial compaction + TEST_SYNC_POINT("DBCompaction::ManualPartial:2"); + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.write_buffer_size = 10 * 1024 * 1024; + options.num_levels = 7; + options.max_subcompactions = max_subcompactions_; + options.level0_file_num_compaction_trigger = 3; + options.max_background_compactions = 3; + options.target_file_size_base = 1 << 23; // 8 MB + + DestroyAndReopen(options); + int32_t value_size = 10 * 1024; // 10 KB + + // Add 2 non-overlapping files + Random rnd(301); + std::map values; + + // file 1 [0 => 100] + for (int32_t i = 0; i < 100; i++) { + values[i] = rnd.RandomString(value_size); + ASSERT_OK(Put(Key(i), values[i])); + } + ASSERT_OK(Flush()); + + // file 2 [100 => 300] + for (int32_t i = 100; i < 300; i++) { + values[i] = rnd.RandomString(value_size); + ASSERT_OK(Put(Key(i), values[i])); + } + ASSERT_OK(Flush()); + + // 2 files in L0 + ASSERT_EQ("2", FilesPerLevel(0)); + CompactRangeOptions compact_options; + compact_options.change_level = true; + compact_options.target_level = 6; + compact_options.exclusive_manual_compaction = exclusive_manual_compaction_; + // Trivial move the two non-overlapping files to level 6 + ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr)); + // 2 files in L6 + ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel(0)); + + ASSERT_EQ(trivial_move, 1); + ASSERT_EQ(non_trivial_move, 0); + + // file 3 [ 0 => 200] + for (int32_t i = 0; i < 200; i++) { + values[i] = rnd.RandomString(value_size); + ASSERT_OK(Put(Key(i), values[i])); + } + ASSERT_OK(Flush()); + + // 1 files in L0 + ASSERT_EQ("1,0,0,0,0,0,2", FilesPerLevel(0)); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, false)); + ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, nullptr, false)); + ASSERT_OK(dbfull()->TEST_CompactRange(2, nullptr, nullptr, nullptr, false)); + 
ASSERT_OK(dbfull()->TEST_CompactRange(3, nullptr, nullptr, nullptr, false)); + ASSERT_OK(dbfull()->TEST_CompactRange(4, nullptr, nullptr, nullptr, false)); + // 2 files in L6, 1 file in L5 + ASSERT_EQ("0,0,0,0,0,1,2", FilesPerLevel(0)); + + ASSERT_EQ(trivial_move, 6); + ASSERT_EQ(non_trivial_move, 0); + + ROCKSDB_NAMESPACE::port::Thread threads([&] { + compact_options.change_level = false; + compact_options.exclusive_manual_compaction = false; + std::string begin_string = Key(0); + std::string end_string = Key(199); + Slice begin(begin_string); + Slice end(end_string); + // First non-trivial compaction is triggered + ASSERT_OK(db_->CompactRange(compact_options, &begin, &end)); + }); + + TEST_SYNC_POINT("DBCompaction::ManualPartial:1"); + // file 4 [300 => 400) + for (int32_t i = 300; i <= 400; i++) { + values[i] = rnd.RandomString(value_size); + ASSERT_OK(Put(Key(i), values[i])); + } + ASSERT_OK(Flush()); + + // file 5 [400 => 500) + for (int32_t i = 400; i <= 500; i++) { + values[i] = rnd.RandomString(value_size); + ASSERT_OK(Put(Key(i), values[i])); + } + ASSERT_OK(Flush()); + + // file 6 [500 => 600) + for (int32_t i = 500; i <= 600; i++) { + values[i] = rnd.RandomString(value_size); + ASSERT_OK(Put(Key(i), values[i])); + } + // Second non-trivial compaction is triggered + ASSERT_OK(Flush()); + + // Before two non-trivial compactions are installed, there are 3 files in L0 + ASSERT_EQ("3,0,0,0,0,1,2", FilesPerLevel(0)); + TEST_SYNC_POINT("DBCompaction::ManualPartial:5"); + + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + // After two non-trivial compactions are installed, there is 1 file in L6, and + // 1 file in L1 + ASSERT_EQ("0,1,0,0,0,0,1", FilesPerLevel(0)); + threads.join(); + + for (int32_t i = 0; i < 600; i++) { + ASSERT_EQ(Get(Key(i)), values[i]); + } +} + +// Disable as the test is flaky. 
+TEST_F(DBCompactionTest, DISABLED_ManualPartialFill) { + int32_t trivial_move = 0; + int32_t non_trivial_move = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:TrivialMove", + [&](void* /*arg*/) { trivial_move++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial", + [&](void* /*arg*/) { non_trivial_move++; }); + bool first = true; + bool second = true; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBCompaction::PartialFill:4", "DBCompaction::PartialFill:1"}, + {"DBCompaction::PartialFill:2", "DBCompaction::PartialFill:3"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) { + if (first) { + TEST_SYNC_POINT("DBCompaction::PartialFill:4"); + first = false; + TEST_SYNC_POINT("DBCompaction::PartialFill:3"); + } else if (second) { + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.write_buffer_size = 10 * 1024 * 1024; + options.max_bytes_for_level_multiplier = 2; + options.num_levels = 4; + options.level0_file_num_compaction_trigger = 3; + options.max_background_compactions = 3; + + DestroyAndReopen(options); + // make sure all background compaction jobs can be scheduled + auto stop_token = + dbfull()->TEST_write_controler().GetCompactionPressureToken(); + int32_t value_size = 10 * 1024; // 10 KB + + // Add 2 non-overlapping files + Random rnd(301); + std::map values; + + // file 1 [0 => 100] + for (int32_t i = 0; i < 100; i++) { + values[i] = rnd.RandomString(value_size); + ASSERT_OK(Put(Key(i), values[i])); + } + ASSERT_OK(Flush()); + + // file 2 [100 => 300] + for (int32_t i = 100; i < 300; i++) { + values[i] = rnd.RandomString(value_size); + ASSERT_OK(Put(Key(i), values[i])); + } + ASSERT_OK(Flush()); + + // 2 files in L0 + ASSERT_EQ("2", FilesPerLevel(0)); + CompactRangeOptions compact_options; + compact_options.change_level = true; + compact_options.target_level = 2; + ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr)); + // 2 files in L2 + ASSERT_EQ("0,0,2", FilesPerLevel(0)); + + ASSERT_EQ(trivial_move, 1); + ASSERT_EQ(non_trivial_move, 0); + + // file 3 [ 0 => 200] + for (int32_t i = 0; i < 200; i++) { + values[i] = rnd.RandomString(value_size); + ASSERT_OK(Put(Key(i), values[i])); + } + ASSERT_OK(Flush()); + + // 2 files in L2, 1 in L0 + ASSERT_EQ("1,0,2", FilesPerLevel(0)); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, false)); + // 2 files in L2, 1 in L1 + ASSERT_EQ("0,1,2", FilesPerLevel(0)); + + ASSERT_EQ(trivial_move, 2); + ASSERT_EQ(non_trivial_move, 0); + + ROCKSDB_NAMESPACE::port::Thread threads([&] { + compact_options.change_level = false; + compact_options.exclusive_manual_compaction = false; + std::string begin_string = Key(0); + std::string end_string = Key(199); + Slice begin(begin_string); + Slice end(end_string); + ASSERT_OK(db_->CompactRange(compact_options, &begin, &end)); + }); + + TEST_SYNC_POINT("DBCompaction::PartialFill:1"); + // Many files 4 [300 => 4300) + for (int32_t i = 0; i <= 5; i++) { + for (int32_t j = 300; j < 4300; j++) { + if (j == 2300) { + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + } + values[j] = rnd.RandomString(value_size); + ASSERT_OK(Put(Key(j), values[j])); + } + } + + // Verify level sizes + uint64_t target_size = 4 * options.max_bytes_for_level_base; + for (int32_t i = 1; i < 
options.num_levels; i++) { + ASSERT_LE(SizeAtLevel(i), target_size); + target_size = static_cast(target_size * + options.max_bytes_for_level_multiplier); + } + + TEST_SYNC_POINT("DBCompaction::PartialFill:2"); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + threads.join(); + + for (int32_t i = 0; i < 4300; i++) { + ASSERT_EQ(Get(Key(i)), values[i]); + } +} + +TEST_F(DBCompactionTest, ManualCompactionWithUnorderedWrite) { + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::WriteImpl:UnorderedWriteAfterWriteWAL", + "DBCompactionTest::ManualCompactionWithUnorderedWrite:WaitWriteWAL"}, + {"DBImpl::WaitForPendingWrites:BeforeBlock", + "DBImpl::WriteImpl:BeforeUnorderedWriteMemtable"}}); + + Options options = CurrentOptions(); + options.unordered_write = true; + DestroyAndReopen(options); + ASSERT_OK(Put("foo", "v1")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("bar", "v1")); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + port::Thread writer([&]() { ASSERT_OK(Put("foo", "v2")); }); + + TEST_SYNC_POINT( + "DBCompactionTest::ManualCompactionWithUnorderedWrite:WaitWriteWAL"); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + writer.join(); + ASSERT_EQ(Get("foo"), "v2"); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + Reopen(options); + ASSERT_EQ(Get("foo"), "v2"); +} + +TEST_F(DBCompactionTest, DeleteFileRange) { + Options options = CurrentOptions(); + options.write_buffer_size = 10 * 1024 * 1024; + options.max_bytes_for_level_multiplier = 2; + options.num_levels = 4; + options.level0_file_num_compaction_trigger = 3; + options.max_background_compactions = 3; + + DestroyAndReopen(options); + int32_t value_size = 10 * 1024; // 10 KB + + // Add 2 non-overlapping files + Random rnd(301); + std::map values; + + // file 1 [0 => 100] + for (int32_t i = 0; i < 100; i++) { + values[i] = rnd.RandomString(value_size); + ASSERT_OK(Put(Key(i), values[i])); + } + ASSERT_OK(Flush()); + + // file 2 [100 => 300] + for (int32_t i = 100; i < 300; i++) { + values[i] = rnd.RandomString(value_size); + ASSERT_OK(Put(Key(i), values[i])); + } + ASSERT_OK(Flush()); + + // 2 files in L0 + ASSERT_EQ("2", FilesPerLevel(0)); + CompactRangeOptions compact_options; + compact_options.change_level = true; + compact_options.target_level = 2; + ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr)); + // 2 files in L2 + ASSERT_EQ("0,0,2", FilesPerLevel(0)); + + // file 3 [ 0 => 200] + for (int32_t i = 0; i < 200; i++) { + values[i] = rnd.RandomString(value_size); + ASSERT_OK(Put(Key(i), values[i])); + } + ASSERT_OK(Flush()); + + // Many files 4 [300 => 4300) + for (int32_t i = 0; i <= 5; i++) { + for (int32_t j = 300; j < 4300; j++) { + if (j == 2300) { + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + } + values[j] = rnd.RandomString(value_size); + ASSERT_OK(Put(Key(j), values[j])); + } + } + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + // Verify level sizes + uint64_t target_size = 4 * options.max_bytes_for_level_base; + for (int32_t i = 1; i < options.num_levels; i++) { + ASSERT_LE(SizeAtLevel(i), target_size); + target_size = static_cast(target_size * + options.max_bytes_for_level_multiplier); + } + + const size_t old_num_files = CountFiles(); + std::string begin_string = Key(1000); + std::string end_string = Key(2000); + Slice 
begin(begin_string); + Slice end(end_string); + ASSERT_OK(DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin, &end)); + + int32_t deleted_count = 0; + for (int32_t i = 0; i < 4300; i++) { + if (i < 1000 || i > 2000) { + ASSERT_EQ(Get(Key(i)), values[i]); + } else { + ReadOptions roptions; + std::string result; + Status s = db_->Get(roptions, Key(i), &result); + ASSERT_TRUE(s.IsNotFound() || s.ok()); + if (s.IsNotFound()) { + deleted_count++; + } + } + } + ASSERT_GT(deleted_count, 0); + begin_string = Key(5000); + end_string = Key(6000); + Slice begin1(begin_string); + Slice end1(end_string); + // Try deleting files in range which contain no keys + ASSERT_OK( + DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin1, &end1)); + + // Push data from level 0 to level 1 to force all data to be deleted + // Note that we don't delete level 0 files + compact_options.change_level = true; + compact_options.target_level = 1; + ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_OK( + DeleteFilesInRange(db_, db_->DefaultColumnFamily(), nullptr, nullptr)); + + int32_t deleted_count2 = 0; + for (int32_t i = 0; i < 4300; i++) { + ReadOptions roptions; + std::string result; + ASSERT_TRUE(db_->Get(roptions, Key(i), &result).IsNotFound()); + deleted_count2++; + } + ASSERT_GT(deleted_count2, deleted_count); + const size_t new_num_files = CountFiles(); + ASSERT_GT(old_num_files, new_num_files); +} + +TEST_F(DBCompactionTest, DeleteFilesInRanges) { + Options options = CurrentOptions(); + options.write_buffer_size = 10 * 1024 * 1024; + options.max_bytes_for_level_multiplier = 2; + options.num_levels = 4; + options.max_background_compactions = 3; + options.disable_auto_compactions = true; + + DestroyAndReopen(options); + int32_t value_size = 10 * 1024; // 10 KB + + Random rnd(301); + std::map values; + + // file [0 => 100), [100 => 200), ... [900, 1000) + for (auto i = 0; i < 10; i++) { + for (auto j = 0; j < 100; j++) { + auto k = i * 100 + j; + values[k] = rnd.RandomString(value_size); + ASSERT_OK(Put(Key(k), values[k])); + } + ASSERT_OK(Flush()); + } + ASSERT_EQ("10", FilesPerLevel(0)); + CompactRangeOptions compact_options; + compact_options.change_level = true; + compact_options.target_level = 2; + ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr)); + ASSERT_EQ("0,0,10", FilesPerLevel(0)); + + // file [0 => 100), [200 => 300), ... [800, 900) + for (auto i = 0; i < 10; i += 2) { + for (auto j = 0; j < 100; j++) { + auto k = i * 100 + j; + ASSERT_OK(Put(Key(k), values[k])); + } + ASSERT_OK(Flush()); + } + ASSERT_EQ("5,0,10", FilesPerLevel(0)); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr)); + ASSERT_EQ("0,5,10", FilesPerLevel(0)); + + // Delete files in range [0, 299] (inclusive) + { + auto begin_str1 = Key(0), end_str1 = Key(100); + auto begin_str2 = Key(100), end_str2 = Key(200); + auto begin_str3 = Key(200), end_str3 = Key(299); + Slice begin1(begin_str1), end1(end_str1); + Slice begin2(begin_str2), end2(end_str2); + Slice begin3(begin_str3), end3(end_str3); + std::vector ranges; + ranges.push_back(RangePtr(&begin1, &end1)); + ranges.push_back(RangePtr(&begin2, &end2)); + ranges.push_back(RangePtr(&begin3, &end3)); + ASSERT_OK(DeleteFilesInRanges(db_, db_->DefaultColumnFamily(), + ranges.data(), ranges.size())); + ASSERT_EQ("0,3,7", FilesPerLevel(0)); + + // Keys [0, 300) should not exist. 
+ for (auto i = 0; i < 300; i++) { + ReadOptions ropts; + std::string result; + auto s = db_->Get(ropts, Key(i), &result); + ASSERT_TRUE(s.IsNotFound()); + } + for (auto i = 300; i < 1000; i++) { + ASSERT_EQ(Get(Key(i)), values[i]); + } + } + + // Delete files in range [600, 999) (exclusive) + { + auto begin_str1 = Key(600), end_str1 = Key(800); + auto begin_str2 = Key(700), end_str2 = Key(900); + auto begin_str3 = Key(800), end_str3 = Key(999); + Slice begin1(begin_str1), end1(end_str1); + Slice begin2(begin_str2), end2(end_str2); + Slice begin3(begin_str3), end3(end_str3); + std::vector ranges; + ranges.push_back(RangePtr(&begin1, &end1)); + ranges.push_back(RangePtr(&begin2, &end2)); + ranges.push_back(RangePtr(&begin3, &end3)); + ASSERT_OK(DeleteFilesInRanges(db_, db_->DefaultColumnFamily(), + ranges.data(), ranges.size(), false)); + ASSERT_EQ("0,1,4", FilesPerLevel(0)); + + // Keys [600, 900) should not exist. + for (auto i = 600; i < 900; i++) { + ReadOptions ropts; + std::string result; + auto s = db_->Get(ropts, Key(i), &result); + ASSERT_TRUE(s.IsNotFound()); + } + for (auto i = 300; i < 600; i++) { + ASSERT_EQ(Get(Key(i)), values[i]); + } + for (auto i = 900; i < 1000; i++) { + ASSERT_EQ(Get(Key(i)), values[i]); + } + } + + // Delete all files. + { + RangePtr range; + ASSERT_OK(DeleteFilesInRanges(db_, db_->DefaultColumnFamily(), &range, 1)); + ASSERT_EQ("", FilesPerLevel(0)); + + for (auto i = 0; i < 1000; i++) { + ReadOptions ropts; + std::string result; + auto s = db_->Get(ropts, Key(i), &result); + ASSERT_TRUE(s.IsNotFound()); + } + } +} + +TEST_F(DBCompactionTest, DeleteFileRangeFileEndpointsOverlapBug) { + // regression test for #2833: groups of files whose user-keys overlap at the + // endpoints could be split by `DeleteFilesInRange`. This caused old data to + // reappear, either because a new version of the key was removed, or a range + // deletion was partially dropped. It could also cause non-overlapping + // invariant to be violated if the files dropped by DeleteFilesInRange were + // a subset of files that a range deletion spans. + const int kNumL0Files = 2; + const int kValSize = 8 << 10; // 8KB + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = kNumL0Files; + options.target_file_size_base = 1 << 10; // 1KB + DestroyAndReopen(options); + + // The snapshot prevents key 1 from having its old version dropped. The low + // `target_file_size_base` ensures two keys will be in each output file. + const Snapshot* snapshot = nullptr; + Random rnd(301); + // The value indicates which flush the key belonged to, which is enough + // for us to determine the keys' relative ages. After L0 flushes finish, + // files look like: + // + // File 0: 0 -> vals[0], 1 -> vals[0] + // File 1: 1 -> vals[1], 2 -> vals[1] + // + // Then L0->L1 compaction happens, which outputs keys as follows: + // + // File 0: 0 -> vals[0], 1 -> vals[1] + // File 1: 1 -> vals[0], 2 -> vals[1] + // + // DeleteFilesInRange shouldn't be allowed to drop just file 0, as that + // would cause `1 -> vals[0]` (an older key) to reappear. + std::string vals[kNumL0Files]; + for (int i = 0; i < kNumL0Files; ++i) { + vals[i] = rnd.RandomString(kValSize); + ASSERT_OK(Put(Key(i), vals[i])); + ASSERT_OK(Put(Key(i + 1), vals[i])); + ASSERT_OK(Flush()); + if (i == 0) { + snapshot = db_->GetSnapshot(); + } + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + // Verify `DeleteFilesInRange` can't drop only file 0 which would cause + // "1 -> vals[0]" to reappear. 
+ std::string begin_str = Key(0), end_str = Key(1); + Slice begin = begin_str, end = end_str; + ASSERT_OK(DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin, &end)); + ASSERT_EQ(vals[1], Get(Key(1))); + + db_->ReleaseSnapshot(snapshot); +} + +TEST_P(DBCompactionTestWithParam, TrivialMoveToLastLevelWithFiles) { + int32_t trivial_move = 0; + int32_t non_trivial_move = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:TrivialMove", + [&](void* /*arg*/) { trivial_move++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial", + [&](void* /*arg*/) { non_trivial_move++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.write_buffer_size = 100000000; + options.max_subcompactions = max_subcompactions_; + DestroyAndReopen(options); + + int32_t value_size = 10 * 1024; // 10 KB + + Random rnd(301); + std::vector values; + // File with keys [ 0 => 99 ] + for (int i = 0; i < 100; i++) { + values.push_back(rnd.RandomString(value_size)); + ASSERT_OK(Put(Key(i), values[i])); + } + ASSERT_OK(Flush()); + + ASSERT_EQ("1", FilesPerLevel(0)); + // Compaction will do L0=>L1 (trivial move) then move L1 files to L3 + CompactRangeOptions compact_options; + compact_options.change_level = true; + compact_options.target_level = 3; + compact_options.exclusive_manual_compaction = exclusive_manual_compaction_; + ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr)); + ASSERT_EQ("0,0,0,1", FilesPerLevel(0)); + ASSERT_EQ(trivial_move, 1); + ASSERT_EQ(non_trivial_move, 0); + + // File with keys [ 100 => 199 ] + for (int i = 100; i < 200; i++) { + values.push_back(rnd.RandomString(value_size)); + ASSERT_OK(Put(Key(i), values[i])); + } + ASSERT_OK(Flush()); + + ASSERT_EQ("1,0,0,1", FilesPerLevel(0)); + CompactRangeOptions cro; + cro.exclusive_manual_compaction = exclusive_manual_compaction_; + // Compaction will do L0=>L1 L1=>L2 L2=>L3 (3 trivial moves) + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,0,0,2", FilesPerLevel(0)); + ASSERT_EQ(trivial_move, 4); + ASSERT_EQ(non_trivial_move, 0); + + for (int i = 0; i < 200; i++) { + ASSERT_EQ(Get(Key(i)), values[i]); + } + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_P(DBCompactionTestWithParam, LevelCompactionThirdPath) { + Options options = CurrentOptions(); + options.db_paths.emplace_back(dbname_, 500 * 1024); + options.db_paths.emplace_back(dbname_ + "_2", 4 * 1024 * 1024); + options.db_paths.emplace_back(dbname_ + "_3", 1024 * 1024 * 1024); + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); + options.compaction_style = kCompactionStyleLevel; + options.write_buffer_size = 110 << 10; // 110KB + options.arena_block_size = 4 << 10; + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 4; + options.max_bytes_for_level_base = 400 * 1024; + options.max_subcompactions = max_subcompactions_; + + DestroyAndReopen(options); + + Random rnd(301); + int key_idx = 0; + + // First three 110KB files are not going to second path. 
+ // After that, (100K, 200K) + for (int num = 0; num < 3; num++) { + GenerateNewFile(&rnd, &key_idx); + } + + // Another 110KB triggers a compaction to 400K file to fill up first path + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ(3, GetSstFileCount(options.db_paths[1].path)); + + // (1, 4) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,4", FilesPerLevel(0)); + ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + // (1, 4, 1) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,4,1", FilesPerLevel(0)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + // (1, 4, 2) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,4,2", FilesPerLevel(0)); + ASSERT_EQ(2, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + // (1, 4, 3) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,4,3", FilesPerLevel(0)); + ASSERT_EQ(3, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + // (1, 4, 4) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,4,4", FilesPerLevel(0)); + ASSERT_EQ(4, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + // (1, 4, 5) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,4,5", FilesPerLevel(0)); + ASSERT_EQ(5, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + // (1, 4, 6) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,4,6", FilesPerLevel(0)); + ASSERT_EQ(6, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + // (1, 4, 7) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,4,7", FilesPerLevel(0)); + ASSERT_EQ(7, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + // (1, 4, 8) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,4,8", FilesPerLevel(0)); + ASSERT_EQ(8, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + for (int i = 0; i < key_idx; i++) { + auto v = Get(Key(i)); + ASSERT_NE(v, "NOT_FOUND"); + ASSERT_TRUE(v.size() == 1 || v.size() == 990); + } + + Reopen(options); + + for (int i = 0; i < key_idx; i++) { + auto v = Get(Key(i)); + ASSERT_NE(v, "NOT_FOUND"); + ASSERT_TRUE(v.size() == 1 || v.size() == 990); + } + + Destroy(options); +} + +TEST_P(DBCompactionTestWithParam, LevelCompactionPathUse) { + Options options = CurrentOptions(); + options.db_paths.emplace_back(dbname_, 500 * 1024); + options.db_paths.emplace_back(dbname_ + "_2", 4 * 1024 * 1024); + options.db_paths.emplace_back(dbname_ + "_3", 1024 * 1024 * 1024); + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); + options.compaction_style = kCompactionStyleLevel; + options.write_buffer_size = 110 << 10; // 110KB + options.arena_block_size = 4 << 10; + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 4; + options.max_bytes_for_level_base = 400 * 1024; + options.max_subcompactions = max_subcompactions_; + + 
DestroyAndReopen(options); + + Random rnd(301); + int key_idx = 0; + + // Always gets compacted into 1 Level1 file, + // 0/1 Level 0 file + for (int num = 0; num < 3; num++) { + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + } + + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,1", FilesPerLevel(0)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("0,1", FilesPerLevel(0)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,1", FilesPerLevel(0)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("0,1", FilesPerLevel(0)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,1", FilesPerLevel(0)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("0,1", FilesPerLevel(0)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,1", FilesPerLevel(0)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("0,1", FilesPerLevel(0)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,1", FilesPerLevel(0)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + for (int i = 0; i < key_idx; i++) { + auto v = Get(Key(i)); + ASSERT_NE(v, "NOT_FOUND"); + ASSERT_TRUE(v.size() == 1 || v.size() == 990); + } + + Reopen(options); + + for (int i = 0; i < key_idx; i++) { + auto v = Get(Key(i)); + ASSERT_NE(v, "NOT_FOUND"); + ASSERT_TRUE(v.size() == 1 || v.size() == 990); + } + + Destroy(options); +} + +TEST_P(DBCompactionTestWithParam, LevelCompactionCFPathUse) { + Options options = CurrentOptions(); + options.db_paths.emplace_back(dbname_, 500 * 1024); + options.db_paths.emplace_back(dbname_ + "_2", 4 * 1024 * 1024); + options.db_paths.emplace_back(dbname_ + "_3", 1024 * 1024 * 1024); + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); + options.compaction_style = kCompactionStyleLevel; + options.write_buffer_size = 110 << 10; // 110KB + options.arena_block_size = 4 << 10; + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 4; + options.max_bytes_for_level_base = 400 * 1024; + 
options.max_subcompactions = max_subcompactions_; + + std::vector option_vector; + option_vector.emplace_back(options); + ColumnFamilyOptions cf_opt1(options), cf_opt2(options); + // Configure CF1 specific paths. + cf_opt1.cf_paths.emplace_back(dbname_ + "cf1", 500 * 1024); + cf_opt1.cf_paths.emplace_back(dbname_ + "cf1_2", 4 * 1024 * 1024); + cf_opt1.cf_paths.emplace_back(dbname_ + "cf1_3", 1024 * 1024 * 1024); + option_vector.emplace_back(DBOptions(options), cf_opt1); + CreateColumnFamilies({"one"}, option_vector[1]); + + // Configure CF2 specific paths. + cf_opt2.cf_paths.emplace_back(dbname_ + "cf2", 500 * 1024); + cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_2", 4 * 1024 * 1024); + cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_3", 1024 * 1024 * 1024); + option_vector.emplace_back(DBOptions(options), cf_opt2); + CreateColumnFamilies({"two"}, option_vector[2]); + + ReopenWithColumnFamilies({"default", "one", "two"}, option_vector); + + Random rnd(301); + int key_idx = 0; + int key_idx1 = 0; + int key_idx2 = 0; + + auto generate_file = [&]() { + GenerateNewFile(0, &rnd, &key_idx); + GenerateNewFile(1, &rnd, &key_idx1); + GenerateNewFile(2, &rnd, &key_idx2); + }; + + auto check_sstfilecount = [&](int path_id, int expected) { + ASSERT_EQ(expected, GetSstFileCount(options.db_paths[path_id].path)); + ASSERT_EQ(expected, GetSstFileCount(cf_opt1.cf_paths[path_id].path)); + ASSERT_EQ(expected, GetSstFileCount(cf_opt2.cf_paths[path_id].path)); + }; + + auto check_filesperlevel = [&](const std::string& expected) { + ASSERT_EQ(expected, FilesPerLevel(0)); + ASSERT_EQ(expected, FilesPerLevel(1)); + ASSERT_EQ(expected, FilesPerLevel(2)); + }; + + auto check_getvalues = [&]() { + for (int i = 0; i < key_idx; i++) { + auto v = Get(0, Key(i)); + ASSERT_NE(v, "NOT_FOUND"); + ASSERT_TRUE(v.size() == 1 || v.size() == 990); + } + + for (int i = 0; i < key_idx1; i++) { + auto v = Get(1, Key(i)); + ASSERT_NE(v, "NOT_FOUND"); + ASSERT_TRUE(v.size() == 1 || v.size() == 990); + } + + for (int i = 0; i < key_idx2; i++) { + auto v = Get(2, Key(i)); + ASSERT_NE(v, "NOT_FOUND"); + ASSERT_TRUE(v.size() == 1 || v.size() == 990); + } + }; + + // Check that default column family uses db_paths. + // And Column family "one" uses cf_paths. + + // The compaction in level0 outputs the sst files in level1. + // The first path cannot hold level1's data(400KB+400KB > 500KB), + // so every compaction move a sst file to second path. Please + // refer to LevelCompactionBuilder::GetPathId. 
+ for (int num = 0; num < 3; num++) { + generate_file(); + } + check_sstfilecount(0, 1); + check_sstfilecount(1, 2); + + generate_file(); + check_sstfilecount(1, 3); + + // (1, 4) + generate_file(); + check_filesperlevel("1,4"); + check_sstfilecount(1, 4); + check_sstfilecount(0, 1); + + // (1, 4, 1) + generate_file(); + check_filesperlevel("1,4,1"); + check_sstfilecount(2, 1); + check_sstfilecount(1, 4); + check_sstfilecount(0, 1); + + // (1, 4, 2) + generate_file(); + check_filesperlevel("1,4,2"); + check_sstfilecount(2, 2); + check_sstfilecount(1, 4); + check_sstfilecount(0, 1); + + check_getvalues(); + + { // Also verify GetLiveFilesStorageInfo with db_paths / cf_paths + std::vector new_infos; + LiveFilesStorageInfoOptions lfsio; + lfsio.wal_size_for_flush = UINT64_MAX; // no flush + ASSERT_OK(db_->GetLiveFilesStorageInfo(lfsio, &new_infos)); + std::unordered_map live_sst_by_dir; + for (auto& info : new_infos) { + if (info.file_type == kTableFile) { + live_sst_by_dir[info.directory]++; + // Verify file on disk (no directory confusion) + uint64_t size; + ASSERT_OK(env_->GetFileSize( + info.directory + "/" + info.relative_filename, &size)); + ASSERT_EQ(info.size, size); + } + } + ASSERT_EQ(3U * 3U, live_sst_by_dir.size()); + for (auto& paths : {options.db_paths, cf_opt1.cf_paths, cf_opt2.cf_paths}) { + ASSERT_EQ(1, live_sst_by_dir[paths[0].path]); + ASSERT_EQ(4, live_sst_by_dir[paths[1].path]); + ASSERT_EQ(2, live_sst_by_dir[paths[2].path]); + } + } + + ReopenWithColumnFamilies({"default", "one", "two"}, option_vector); + + check_getvalues(); + + Destroy(options, true); +} + +TEST_P(DBCompactionTestWithParam, ConvertCompactionStyle) { + Random rnd(301); + int max_key_level_insert = 200; + int max_key_universal_insert = 600; + + // Stage 1: generate a db with level compaction + Options options = CurrentOptions(); + options.write_buffer_size = 110 << 10; // 110KB + options.arena_block_size = 4 << 10; + options.num_levels = 4; + options.level0_file_num_compaction_trigger = 3; + options.max_bytes_for_level_base = 500 << 10; // 500KB + options.max_bytes_for_level_multiplier = 1; + options.target_file_size_base = 200 << 10; // 200KB + options.target_file_size_multiplier = 1; + options.max_subcompactions = max_subcompactions_; + CreateAndReopenWithCF({"pikachu"}, options); + + for (int i = 0; i <= max_key_level_insert; i++) { + // each value is 10K + ASSERT_OK(Put(1, Key(i), rnd.RandomString(10000))); + } + ASSERT_OK(Flush(1)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_GT(TotalTableFiles(1, 4), 1); + int non_level0_num_files = 0; + for (int i = 1; i < options.num_levels; i++) { + non_level0_num_files += NumTableFilesAtLevel(i, 1); + } + ASSERT_GT(non_level0_num_files, 0); + + // Stage 2: reopen with universal compaction - should fail + options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.num_levels = 1; + options = CurrentOptions(options); + Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_TRUE(s.IsInvalidArgument()); + + // Stage 3: compact into a single file and move the file to level 0 + options = CurrentOptions(); + options.disable_auto_compactions = true; + options.target_file_size_base = INT_MAX; + options.target_file_size_multiplier = 1; + options.max_bytes_for_level_base = INT_MAX; + options.max_bytes_for_level_multiplier = 1; + options.num_levels = 4; + options = CurrentOptions(options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + + CompactRangeOptions compact_options; + 
compact_options.change_level = true; + compact_options.target_level = 0; + // cannot use kForceOptimized here because the compaction here is expected + // to generate one output file + compact_options.bottommost_level_compaction = + BottommostLevelCompaction::kForce; + compact_options.exclusive_manual_compaction = exclusive_manual_compaction_; + ASSERT_OK( + dbfull()->CompactRange(compact_options, handles_[1], nullptr, nullptr)); + + // Only 1 file in L0 + ASSERT_EQ("1", FilesPerLevel(1)); + + // Stage 4: re-open in universal compaction style and do some db operations + options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.num_levels = 4; + options.write_buffer_size = 110 << 10; // 110KB + options.arena_block_size = 4 << 10; + options.level0_file_num_compaction_trigger = 3; + options = CurrentOptions(options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + + options.num_levels = 1; + ReopenWithColumnFamilies({"default", "pikachu"}, options); + + for (int i = max_key_level_insert / 2; i <= max_key_universal_insert; i++) { + ASSERT_OK(Put(1, Key(i), rnd.RandomString(10000))); + } + ASSERT_OK(dbfull()->Flush(FlushOptions())); + ASSERT_OK(Flush(1)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + for (int i = 1; i < options.num_levels; i++) { + ASSERT_EQ(NumTableFilesAtLevel(i, 1), 0); + } + + // verify keys inserted in both level compaction style and universal + // compaction style + std::string keys_in_db; + Iterator* iter = dbfull()->NewIterator(ReadOptions(), handles_[1]); + ASSERT_OK(iter->status()); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + keys_in_db.append(iter->key().ToString()); + keys_in_db.push_back(','); + } + delete iter; + + std::string expected_keys; + for (int i = 0; i <= max_key_universal_insert; i++) { + expected_keys.append(Key(i)); + expected_keys.push_back(','); + } + + ASSERT_EQ(keys_in_db, expected_keys); +} + +TEST_F(DBCompactionTest, L0_CompactionBug_Issue44_a) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + ASSERT_OK(Put(1, "b", "v")); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ASSERT_OK(Delete(1, "b")); + ASSERT_OK(Delete(1, "a")); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ASSERT_OK(Delete(1, "a")); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ASSERT_OK(Put(1, "a", "v")); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ASSERT_EQ("(a->v)", Contents(1)); + env_->SleepForMicroseconds(1000000); // Wait for compaction to finish + ASSERT_EQ("(a->v)", Contents(1)); + } while (ChangeCompactOptions()); +} + +TEST_F(DBCompactionTest, L0_CompactionBug_Issue44_b) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + ASSERT_OK(Put(1, "", "")); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ASSERT_OK(Delete(1, "e")); + ASSERT_OK(Put(1, "", "")); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ASSERT_OK(Put(1, "c", "cv")); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ASSERT_OK(Put(1, "", "")); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ASSERT_OK(Put(1, "", "")); + env_->SleepForMicroseconds(1000000); // Wait for compaction to finish + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ASSERT_OK(Put(1, "d", "dv")); + ReopenWithColumnFamilies({"default", "pikachu"}, 
CurrentOptions()); + ASSERT_OK(Put(1, "", "")); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ASSERT_OK(Delete(1, "d")); + ASSERT_OK(Delete(1, "b")); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ASSERT_EQ("(->)(c->cv)", Contents(1)); + env_->SleepForMicroseconds(1000000); // Wait for compaction to finish + ASSERT_EQ("(->)(c->cv)", Contents(1)); + } while (ChangeCompactOptions()); +} + +TEST_F(DBCompactionTest, ManualAutoRace) { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::BGWorkCompaction", "DBCompactionTest::ManualAutoRace:1"}, + {"DBImpl::RunManualCompaction:WaitScheduled", + "BackgroundCallCompaction:0"}}); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put(1, "foo", "")); + ASSERT_OK(Put(1, "bar", "")); + ASSERT_OK(Flush(1)); + ASSERT_OK(Put(1, "foo", "")); + ASSERT_OK(Put(1, "bar", "")); + // Generate four files in CF 0, which should trigger an auto compaction + ASSERT_OK(Put("foo", "")); + ASSERT_OK(Put("bar", "")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo", "")); + ASSERT_OK(Put("bar", "")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo", "")); + ASSERT_OK(Put("bar", "")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo", "")); + ASSERT_OK(Put("bar", "")); + ASSERT_OK(Flush()); + + // The auto compaction is scheduled but waits until here + TEST_SYNC_POINT("DBCompactionTest::ManualAutoRace:1"); + // The auto compaction will wait until the manual compaction is registered + // before processing so that it will be cancelled. + CompactRangeOptions cro; + cro.exclusive_manual_compaction = true; + ASSERT_OK(dbfull()->CompactRange(cro, handles_[1], nullptr, nullptr)); + ASSERT_EQ("0,1", FilesPerLevel(1)); + + // Eventually the cancelled compaction will be rescheduled and executed.
+ ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ("0,1", FilesPerLevel(0)); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_P(DBCompactionTestWithParam, ManualCompaction) { + Options options = CurrentOptions(); + options.max_subcompactions = max_subcompactions_; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + CreateAndReopenWithCF({"pikachu"}, options); + + // iter - 0 with 7 levels + // iter - 1 with 3 levels + for (int iter = 0; iter < 2; ++iter) { + MakeTables(3, "p", "q", 1); + ASSERT_EQ("1,1,1", FilesPerLevel(1)); + + // Compaction range falls before files + Compact(1, "", "c"); + ASSERT_EQ("1,1,1", FilesPerLevel(1)); + + // Compaction range falls after files + Compact(1, "r", "z"); + ASSERT_EQ("1,1,1", FilesPerLevel(1)); + + // Compaction range overlaps files + Compact(1, "p", "q"); + ASSERT_EQ("0,0,1", FilesPerLevel(1)); + + // Populate a different range + MakeTables(3, "c", "e", 1); + ASSERT_EQ("1,1,2", FilesPerLevel(1)); + + // Compact just the new range + Compact(1, "b", "f"); + ASSERT_EQ("0,0,2", FilesPerLevel(1)); + + // Compact all + MakeTables(1, "a", "z", 1); + ASSERT_EQ("1,0,2", FilesPerLevel(1)); + + uint64_t prev_block_cache_add = + options.statistics->getTickerCount(BLOCK_CACHE_ADD); + CompactRangeOptions cro; + cro.exclusive_manual_compaction = exclusive_manual_compaction_; + ASSERT_OK(db_->CompactRange(cro, handles_[1], nullptr, nullptr)); + // Verify manual compaction doesn't fill block cache + ASSERT_EQ(prev_block_cache_add, + options.statistics->getTickerCount(BLOCK_CACHE_ADD)); + + ASSERT_EQ("0,0,1", FilesPerLevel(1)); + + if (iter == 0) { + options = CurrentOptions(); + options.num_levels = 3; + options.create_if_missing = true; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + } + } +} + +TEST_P(DBCompactionTestWithParam, ManualLevelCompactionOutputPathId) { + Options options = CurrentOptions(); + options.db_paths.emplace_back(dbname_ + "_2", 2 * 10485760); + options.db_paths.emplace_back(dbname_ + "_3", 100 * 10485760); + options.db_paths.emplace_back(dbname_ + "_4", 120 * 10485760); + options.max_subcompactions = max_subcompactions_; + CreateAndReopenWithCF({"pikachu"}, options); + + // iter - 0 with 7 levels + // iter - 1 with 3 levels + for (int iter = 0; iter < 2; ++iter) { + for (int i = 0; i < 3; ++i) { + ASSERT_OK(Put(1, "p", "begin")); + ASSERT_OK(Put(1, "q", "end")); + ASSERT_OK(Flush(1)); + } + ASSERT_EQ("3", FilesPerLevel(1)); + ASSERT_EQ(3, GetSstFileCount(options.db_paths[0].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + // Compaction range falls before files + Compact(1, "", "c"); + ASSERT_EQ("3", FilesPerLevel(1)); + + // Compaction range falls after files + Compact(1, "r", "z"); + ASSERT_EQ("3", FilesPerLevel(1)); + + // Compaction range overlaps files + Compact(1, "p", "q", 1); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ("0,1", FilesPerLevel(1)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + // Populate a different range + for (int i = 0; i < 3; ++i) { + ASSERT_OK(Put(1, "c", "begin")); + ASSERT_OK(Put(1, "e", "end")); + ASSERT_OK(Flush(1)); + } + ASSERT_EQ("3,1", FilesPerLevel(1)); + + // Compact just the new range + Compact(1, "b", "f", 1); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ("0,2", FilesPerLevel(1)); + ASSERT_EQ(2, 
GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + // Compact all + ASSERT_OK(Put(1, "a", "begin")); + ASSERT_OK(Put(1, "z", "end")); + ASSERT_OK(Flush(1)); + ASSERT_EQ("1,2", FilesPerLevel(1)); + ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path)); + CompactRangeOptions compact_options; + compact_options.target_path_id = 1; + compact_options.exclusive_manual_compaction = exclusive_manual_compaction_; + ASSERT_OK( + db_->CompactRange(compact_options, handles_[1], nullptr, nullptr)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_EQ("0,1", FilesPerLevel(1)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + if (iter == 0) { + DestroyAndReopen(options); + options = CurrentOptions(); + options.db_paths.emplace_back(dbname_ + "_2", 2 * 10485760); + options.db_paths.emplace_back(dbname_ + "_3", 100 * 10485760); + options.db_paths.emplace_back(dbname_ + "_4", 120 * 10485760); + options.max_background_flushes = 1; + options.num_levels = 3; + options.create_if_missing = true; + CreateAndReopenWithCF({"pikachu"}, options); + } + } +} + +TEST_F(DBCompactionTest, FilesDeletedAfterCompaction) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + ASSERT_OK(Put(1, "foo", "v2")); + Compact(1, "a", "z"); + const size_t num_files = CountLiveFiles(); + for (int i = 0; i < 10; i++) { + ASSERT_OK(Put(1, "foo", "v2")); + Compact(1, "a", "z"); + } + ASSERT_EQ(CountLiveFiles(), num_files); + } while (ChangeCompactOptions()); +} + +// Check level compaction with compact files +TEST_P(DBCompactionTestWithParam, DISABLED_CompactFilesOnLevelCompaction) { + const int kTestKeySize = 16; + const int kTestValueSize = 984; + const int kEntrySize = kTestKeySize + kTestValueSize; + const int kEntriesPerBuffer = 100; + Options options; + options.create_if_missing = true; + options.write_buffer_size = kEntrySize * kEntriesPerBuffer; + options.compaction_style = kCompactionStyleLevel; + options.target_file_size_base = options.write_buffer_size; + options.max_bytes_for_level_base = options.target_file_size_base * 2; + options.level0_stop_writes_trigger = 2; + options.max_bytes_for_level_multiplier = 2; + options.compression = kNoCompression; + options.max_subcompactions = max_subcompactions_; + options = CurrentOptions(options); + CreateAndReopenWithCF({"pikachu"}, options); + + Random rnd(301); + for (int key = 64 * kEntriesPerBuffer; key >= 0; --key) { + ASSERT_OK(Put(1, std::to_string(key), rnd.RandomString(kTestValueSize))); + } + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ColumnFamilyMetaData cf_meta; + dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta); + int output_level = static_cast<int>(cf_meta.levels.size()) - 1; + for (int file_picked = 5; file_picked > 0; --file_picked) { + std::set<std::string> overlapping_file_names; + std::vector<std::string> compaction_input_file_names; + for (int f = 0; f < file_picked; ++f) { + int level = 0; + auto file_meta = PickFileRandomly(cf_meta, &rnd, &level); + compaction_input_file_names.push_back(file_meta->name); + GetOverlappingFileNumbersForLevelCompaction( + cf_meta, options.comparator, level, output_level, file_meta, + &overlapping_file_names); + } + + ASSERT_OK(dbfull()->CompactFiles(CompactionOptions(), handles_[1], +
compaction_input_file_names, + output_level)); + + // Make sure all overlapping files do not exist after compaction + dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta); + VerifyCompactionResult(cf_meta, overlapping_file_names); + } + + // make sure all key-values are still there. + for (int key = 64 * kEntriesPerBuffer; key >= 0; --key) { + ASSERT_NE(Get(1, std::to_string(key)), "NOT_FOUND"); + } +} + +TEST_P(DBCompactionTestWithParam, PartialCompactionFailure) { + Options options; + const int kKeySize = 16; + const int kKvSize = 1000; + const int kKeysPerBuffer = 100; + const int kNumL1Files = 5; + options.create_if_missing = true; + options.write_buffer_size = kKeysPerBuffer * kKvSize; + options.max_write_buffer_number = 2; + options.target_file_size_base = + options.write_buffer_size * (options.max_write_buffer_number - 1); + options.level0_file_num_compaction_trigger = kNumL1Files; + options.max_bytes_for_level_base = + options.level0_file_num_compaction_trigger * + options.target_file_size_base; + options.max_bytes_for_level_multiplier = 2; + options.compression = kNoCompression; + options.max_subcompactions = max_subcompactions_; + + env_->SetBackgroundThreads(1, Env::HIGH); + env_->SetBackgroundThreads(1, Env::LOW); + // stop the compaction thread until we simulate the file creation failure. + test::SleepingBackgroundTask sleeping_task_low; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + + options.env = env_; + + DestroyAndReopen(options); + + const int kNumInsertedKeys = options.level0_file_num_compaction_trigger * + (options.max_write_buffer_number - 1) * + kKeysPerBuffer; + + Random rnd(301); + std::vector keys; + std::vector values; + for (int k = 0; k < kNumInsertedKeys; ++k) { + keys.emplace_back(rnd.RandomString(kKeySize)); + values.emplace_back(rnd.RandomString(kKvSize - kKeySize)); + ASSERT_OK(Put(Slice(keys[k]), Slice(values[k]))); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + } + + ASSERT_OK(dbfull()->TEST_FlushMemTable(true)); + // Make sure the number of L0 files can trigger compaction. + ASSERT_GE(NumTableFilesAtLevel(0), + options.level0_file_num_compaction_trigger); + + auto previous_num_level0_files = NumTableFilesAtLevel(0); + + // Fail the first file creation. + env_->non_writable_count_ = 1; + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); + + // Expect compaction to fail here as one file will fail its + // creation. + ASSERT_TRUE(!dbfull()->TEST_WaitForCompact().ok()); + + // Verify L0 -> L1 compaction does fail. + ASSERT_EQ(NumTableFilesAtLevel(1), 0); + + // Verify all L0 files are still there. + ASSERT_EQ(NumTableFilesAtLevel(0), previous_num_level0_files); + + // All key-values must exist after compaction fails. + for (int k = 0; k < kNumInsertedKeys; ++k) { + ASSERT_EQ(values[k], Get(keys[k])); + } + + env_->non_writable_count_ = 0; + + // Make sure RocksDB will not get into corrupted state. + Reopen(options); + + // Verify again after reopen. 
+ for (int k = 0; k < kNumInsertedKeys; ++k) { + ASSERT_EQ(values[k], Get(keys[k])); + } +} + +TEST_P(DBCompactionTestWithParam, DeleteMovedFileAfterCompaction) { + // iter 1 -- delete_obsolete_files_period_micros == 0 + for (int iter = 0; iter < 2; ++iter) { + // This test triggers move compaction and verifies that the file is not + // deleted when it's part of move compaction + Options options = CurrentOptions(); + options.env = env_; + if (iter == 1) { + options.delete_obsolete_files_period_micros = 0; + } + options.create_if_missing = true; + options.level0_file_num_compaction_trigger = + 2; // trigger compaction when we have 2 files + OnFileDeletionListener* listener = new OnFileDeletionListener(); + options.listeners.emplace_back(listener); + options.max_subcompactions = max_subcompactions_; + DestroyAndReopen(options); + + Random rnd(301); + // Create two 1MB sst files + for (int i = 0; i < 2; ++i) { + // Create 1MB sst file + for (int j = 0; j < 100; ++j) { + ASSERT_OK(Put(Key(i * 50 + j), rnd.RandomString(10 * 1024))); + } + ASSERT_OK(Flush()); + } + // this should execute L0->L1 + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ("0,1", FilesPerLevel(0)); + + // block compactions + test::SleepingBackgroundTask sleeping_task; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task, + Env::Priority::LOW); + + options.max_bytes_for_level_base = 1024 * 1024; // 1 MB + Reopen(options); + std::unique_ptr iterator(db_->NewIterator(ReadOptions())); + ASSERT_EQ("0,1", FilesPerLevel(0)); + // let compactions go + sleeping_task.WakeUp(); + sleeping_task.WaitUntilDone(); + + // this should execute L1->L2 (move) + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_EQ("0,0,1", FilesPerLevel(0)); + + std::vector metadata; + db_->GetLiveFilesMetaData(&metadata); + ASSERT_EQ(metadata.size(), 1U); + auto moved_file_name = metadata[0].name; + + // Create two more 1MB sst files + for (int i = 0; i < 2; ++i) { + // Create 1MB sst file + for (int j = 0; j < 100; ++j) { + ASSERT_OK(Put(Key(i * 50 + j + 100), rnd.RandomString(10 * 1024))); + } + ASSERT_OK(Flush()); + } + // this should execute both L0->L1 and L1->L2 (merge with previous file) + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_EQ("0,0,2", FilesPerLevel(0)); + + // iterator is holding the file + ASSERT_OK(env_->FileExists(dbname_ + moved_file_name)); + + listener->SetExpectedFileName(dbname_ + moved_file_name); + ASSERT_OK(iterator->status()); + iterator.reset(); + + // this file should have been compacted away + ASSERT_NOK(env_->FileExists(dbname_ + moved_file_name)); + listener->VerifyMatchedCount(1); + } +} + +TEST_P(DBCompactionTestWithParam, CompressLevelCompaction) { + if (!Zlib_Supported()) { + return; + } + Options options = CurrentOptions(); + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); + options.compaction_style = kCompactionStyleLevel; + options.write_buffer_size = 110 << 10; // 110KB + options.arena_block_size = 4 << 10; + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 4; + options.max_bytes_for_level_base = 400 * 1024; + options.max_subcompactions = max_subcompactions_; + // First two levels have no compression, so that a trivial move between + // them will be allowed. 
Level 2 has Zlib compression so that a trivial + // move to level 3 will not be allowed + options.compression_per_level = {kNoCompression, kNoCompression, + kZlibCompression}; + int matches = 0, didnt_match = 0, trivial_move = 0, non_trivial = 0; + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "Compaction::InputCompressionMatchesOutput:Matches", + [&](void* /*arg*/) { matches++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "Compaction::InputCompressionMatchesOutput:DidntMatch", + [&](void* /*arg*/) { didnt_match++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial", + [&](void* /*arg*/) { non_trivial++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:TrivialMove", + [&](void* /*arg*/) { trivial_move++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Reopen(options); + + Random rnd(301); + int key_idx = 0; + + // First three 110KB files are going to level 0 + // After that, (100K, 200K) + for (int num = 0; num < 3; num++) { + GenerateNewFile(&rnd, &key_idx); + } + + // Another 110KB triggers a compaction to 400K file to fill up level 0 + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ(4, GetSstFileCount(dbname_)); + + // (1, 4) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,4", FilesPerLevel(0)); + + // (1, 4, 1) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,4,1", FilesPerLevel(0)); + + // (1, 4, 2) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,4,2", FilesPerLevel(0)); + + // (1, 4, 3) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,4,3", FilesPerLevel(0)); + + // (1, 4, 4) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,4,4", FilesPerLevel(0)); + + // (1, 4, 5) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,4,5", FilesPerLevel(0)); + + // (1, 4, 6) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,4,6", FilesPerLevel(0)); + + // (1, 4, 7) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,4,7", FilesPerLevel(0)); + + // (1, 4, 8) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,4,8", FilesPerLevel(0)); + + ASSERT_EQ(matches, 12); + // Currently, the test relies on the number of calls to + // InputCompressionMatchesOutput() per compaction. 
+ const int kCallsToInputCompressionMatch = 2; + ASSERT_EQ(didnt_match, 8 * kCallsToInputCompressionMatch); + ASSERT_EQ(trivial_move, 12); + ASSERT_EQ(non_trivial, 8); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + for (int i = 0; i < key_idx; i++) { + auto v = Get(Key(i)); + ASSERT_NE(v, "NOT_FOUND"); + ASSERT_TRUE(v.size() == 1 || v.size() == 990); + } + + Reopen(options); + + for (int i = 0; i < key_idx; i++) { + auto v = Get(Key(i)); + ASSERT_NE(v, "NOT_FOUND"); + ASSERT_TRUE(v.size() == 1 || v.size() == 990); + } + + Destroy(options); +} + +TEST_F(DBCompactionTest, SanitizeCompactionOptionsTest) { + Options options = CurrentOptions(); + options.max_background_compactions = 5; + options.soft_pending_compaction_bytes_limit = 0; + options.hard_pending_compaction_bytes_limit = 100; + options.create_if_missing = true; + DestroyAndReopen(options); + ASSERT_EQ(100, db_->GetOptions().soft_pending_compaction_bytes_limit); + + options.max_background_compactions = 3; + options.soft_pending_compaction_bytes_limit = 200; + options.hard_pending_compaction_bytes_limit = 150; + DestroyAndReopen(options); + ASSERT_EQ(150, db_->GetOptions().soft_pending_compaction_bytes_limit); +} + +// This tests for a bug that could cause two level0 compactions running +// concurrently +// TODO(aekmekji): Make sure that the reason this fails when run with +// max_subcompactions > 1 is not a correctness issue but just inherent to +// running parallel L0-L1 compactions +TEST_F(DBCompactionTest, SuggestCompactRangeNoTwoLevel0Compactions) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleLevel; + options.write_buffer_size = 110 << 10; + options.arena_block_size = 4 << 10; + options.level0_file_num_compaction_trigger = 4; + options.num_levels = 4; + options.compression = kNoCompression; + options.max_bytes_for_level_base = 450 << 10; + options.target_file_size_base = 98 << 10; + options.max_write_buffer_number = 2; + options.max_background_compactions = 2; + + DestroyAndReopen(options); + + // fill up the DB + Random rnd(301); + for (int num = 0; num < 10; num++) { + GenerateNewRandomFile(&rnd); + } + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"CompactionJob::Run():Start", + "DBCompactionTest::SuggestCompactRangeNoTwoLevel0Compactions:1"}, + {"DBCompactionTest::SuggestCompactRangeNoTwoLevel0Compactions:2", + "CompactionJob::Run():End"}}); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // trigger L0 compaction + for (int num = 0; num < options.level0_file_num_compaction_trigger + 1; + num++) { + GenerateNewRandomFile(&rnd, /* nowait */ true); + ASSERT_OK(Flush()); + } + + TEST_SYNC_POINT( + "DBCompactionTest::SuggestCompactRangeNoTwoLevel0Compactions:1"); + + GenerateNewRandomFile(&rnd, /* nowait */ true); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(experimental::SuggestCompactRange(db_, nullptr, nullptr)); + for (int num = 0; num < options.level0_file_num_compaction_trigger + 1; + num++) { + GenerateNewRandomFile(&rnd, /* nowait */ true); + ASSERT_OK(Flush()); + } + + TEST_SYNC_POINT( + "DBCompactionTest::SuggestCompactRangeNoTwoLevel0Compactions:2"); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); +} + +static std::string ShortKey(int i) { + assert(i < 10000); + char buf[100]; + snprintf(buf, sizeof(buf), "key%04d", i); + return std::string(buf); +} + +TEST_P(DBCompactionTestWithParam, ForceBottommostLevelCompaction) { + 
int32_t trivial_move = 0; + int32_t non_trivial_move = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:TrivialMove", + [&](void* /*arg*/) { trivial_move++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial", + [&](void* /*arg*/) { non_trivial_move++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // The key size is guaranteed to be <= 8 + class ShortKeyComparator : public Comparator { + int Compare(const ROCKSDB_NAMESPACE::Slice& a, + const ROCKSDB_NAMESPACE::Slice& b) const override { + assert(a.size() <= 8); + assert(b.size() <= 8); + return BytewiseComparator()->Compare(a, b); + } + const char* Name() const override { return "ShortKeyComparator"; } + void FindShortestSeparator( + std::string* start, + const ROCKSDB_NAMESPACE::Slice& limit) const override { + return BytewiseComparator()->FindShortestSeparator(start, limit); + } + void FindShortSuccessor(std::string* key) const override { + return BytewiseComparator()->FindShortSuccessor(key); + } + } short_key_cmp; + Options options = CurrentOptions(); + options.target_file_size_base = 100000000; + options.write_buffer_size = 100000000; + options.max_subcompactions = max_subcompactions_; + options.comparator = &short_key_cmp; + DestroyAndReopen(options); + + int32_t value_size = 10 * 1024; // 10 KB + + Random rnd(301); + std::vector<std::string> values; + // File with keys [ 0 => 99 ] + for (int i = 0; i < 100; i++) { + values.push_back(rnd.RandomString(value_size)); + ASSERT_OK(Put(ShortKey(i), values[i])); + } + ASSERT_OK(Flush()); + + ASSERT_EQ("1", FilesPerLevel(0)); + // Compaction will do L0=>L1 (trivial move) then move L1 files to L3 + CompactRangeOptions compact_options; + compact_options.change_level = true; + compact_options.target_level = 3; + ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr)); + ASSERT_EQ("0,0,0,1", FilesPerLevel(0)); + ASSERT_EQ(trivial_move, 1); + ASSERT_EQ(non_trivial_move, 0); + + // File with keys [ 100 => 199 ] + for (int i = 100; i < 200; i++) { + values.push_back(rnd.RandomString(value_size)); + ASSERT_OK(Put(ShortKey(i), values[i])); + } + ASSERT_OK(Flush()); + + ASSERT_EQ("1,0,0,1", FilesPerLevel(0)); + // Compaction will do L0=>L1 L1=>L2 L2=>L3 (3 trivial moves) + // then compact the bottommost level L3=>L3 (non-trivial move) + compact_options = CompactRangeOptions(); + compact_options.bottommost_level_compaction = + BottommostLevelCompaction::kForceOptimized; + ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr)); + ASSERT_EQ("0,0,0,1", FilesPerLevel(0)); + ASSERT_EQ(trivial_move, 4); + ASSERT_EQ(non_trivial_move, 1); + + // File with keys [ 200 => 299 ] + for (int i = 200; i < 300; i++) { + values.push_back(rnd.RandomString(value_size)); + ASSERT_OK(Put(ShortKey(i), values[i])); + } + ASSERT_OK(Flush()); + + ASSERT_EQ("1,0,0,1", FilesPerLevel(0)); + trivial_move = 0; + non_trivial_move = 0; + compact_options = CompactRangeOptions(); + compact_options.bottommost_level_compaction = + BottommostLevelCompaction::kSkip; + // Compaction will do L0=>L1 L1=>L2 L2=>L3 (3 trivial moves) + // and will skip bottommost level compaction + ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr)); + ASSERT_EQ("0,0,0,2", FilesPerLevel(0)); + ASSERT_EQ(trivial_move, 3); + ASSERT_EQ(non_trivial_move, 0); + + for (int i = 0; i < 300; i++) { + ASSERT_EQ(Get(ShortKey(i)), values[i]); + } + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} +
+TEST_P(DBCompactionTestWithParam, IntraL0Compaction) { + Options options = CurrentOptions(); + options.compression = kNoCompression; + options.level0_file_num_compaction_trigger = 5; + options.max_background_compactions = 2; + options.max_subcompactions = max_subcompactions_; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + options.write_buffer_size = 2 << 20; // 2MB + + BlockBasedTableOptions table_options; + table_options.block_cache = NewLRUCache(64 << 20); // 64MB + table_options.cache_index_and_filter_blocks = true; + table_options.pin_l0_filter_and_index_blocks_in_cache = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + DestroyAndReopen(options); + + const size_t kValueSize = 1 << 20; + Random rnd(301); + std::string value(rnd.RandomString(kValueSize)); + + // The L0->L1 must be picked before we begin flushing files to trigger + // intra-L0 compaction, and must not finish until after an intra-L0 + // compaction has been picked. + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"LevelCompactionPicker::PickCompaction:Return", + "DBCompactionTest::IntraL0Compaction:L0ToL1Ready"}, + {"LevelCompactionPicker::PickCompactionBySize:0", + "CompactionJob::Run():Start"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // index: 0 1 2 3 4 5 6 7 8 9 + // size: 1MB 1MB 1MB 1MB 1MB 2MB 1MB 1MB 1MB 1MB + // score: 1.5 1.3 1.5 2.0 inf + // + // Files 0-4 will be included in an L0->L1 compaction. + // + // L0->L0 will be triggered since the sync points guarantee compaction to base + // level is still blocked when files 5-9 trigger another compaction. + // + // Files 6-9 are the longest span of available files for which + // work-per-deleted-file decreases (see "score" row above). + for (int i = 0; i < 10; ++i) { + ASSERT_OK(Put(Key(0), "")); // prevents trivial move + if (i == 5) { + TEST_SYNC_POINT("DBCompactionTest::IntraL0Compaction:L0ToL1Ready"); + ASSERT_OK(Put(Key(i + 1), value + value)); + } else { + ASSERT_OK(Put(Key(i + 1), value)); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + std::vector> level_to_files; + dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(), + &level_to_files); + ASSERT_GE(level_to_files.size(), 2); // at least L0 and L1 + // L0 has the 2MB file (not compacted) and 4MB file (output of L0->L0) + ASSERT_EQ(2, level_to_files[0].size()); + ASSERT_GT(level_to_files[1].size(), 0); + for (int i = 0; i < 2; ++i) { + ASSERT_GE(level_to_files[0][i].fd.file_size, 1 << 21); + } + + // The index/filter in the file produced by intra-L0 should not be pinned. + // That means clearing unref'd entries in block cache and re-accessing the + // file produced by intra-L0 should bump the index block miss count. + uint64_t prev_index_misses = + TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS); + table_options.block_cache->EraseUnRefEntries(); + ASSERT_EQ("", Get(Key(0))); + ASSERT_EQ(prev_index_misses + 1, + TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); +} + +TEST_P(DBCompactionTestWithParam, IntraL0CompactionDoesNotObsoleteDeletions) { + // regression test for issue #2722: L0->L0 compaction can resurrect deleted + // keys from older L0 files if L1+ files' key-ranges do not include the key. 
+ Options options = CurrentOptions(); + options.compression = kNoCompression; + options.level0_file_num_compaction_trigger = 5; + options.max_background_compactions = 2; + options.max_subcompactions = max_subcompactions_; + DestroyAndReopen(options); + + const size_t kValueSize = 1 << 20; + Random rnd(301); + std::string value(rnd.RandomString(kValueSize)); + + // The L0->L1 must be picked before we begin flushing files to trigger + // intra-L0 compaction, and must not finish until after an intra-L0 + // compaction has been picked. + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"LevelCompactionPicker::PickCompaction:Return", + "DBCompactionTest::IntraL0CompactionDoesNotObsoleteDeletions:" + "L0ToL1Ready"}, + {"LevelCompactionPicker::PickCompactionBySize:0", + "CompactionJob::Run():Start"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // index: 0 1 2 3 4 5 6 7 8 9 + // size: 1MB 1MB 1MB 1MB 1MB 1MB 1MB 1MB 1MB 1MB + // score: 1.25 1.33 1.5 2.0 inf + // + // Files 0-4 will be included in an L0->L1 compaction. + // + // L0->L0 will be triggered since the sync points guarantee compaction to base + // level is still blocked when files 5-9 trigger another compaction. All files + // 5-9 are included in the L0->L0 due to work-per-deleted file decreasing. + // + // Put a key-value in files 0-4. Delete that key in files 5-9. Verify the + // L0->L0 preserves the deletion such that the key remains deleted. + for (int i = 0; i < 10; ++i) { + // key 0 serves both to prevent trivial move and as the key we want to + // verify is not resurrected by L0->L0 compaction. + if (i < 5) { + ASSERT_OK(Put(Key(0), "")); + } else { + ASSERT_OK(Delete(Key(0))); + } + if (i == 5) { + TEST_SYNC_POINT( + "DBCompactionTest::IntraL0CompactionDoesNotObsoleteDeletions:" + "L0ToL1Ready"); + } + ASSERT_OK(Put(Key(i + 1), value)); + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + std::vector> level_to_files; + dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(), + &level_to_files); + ASSERT_GE(level_to_files.size(), 2); // at least L0 and L1 + // L0 has a single output file from L0->L0 + ASSERT_EQ(1, level_to_files[0].size()); + ASSERT_GT(level_to_files[1].size(), 0); + ASSERT_GE(level_to_files[0][0].fd.file_size, 1 << 22); + + ReadOptions roptions; + std::string result; + ASSERT_TRUE(db_->Get(roptions, Key(0), &result).IsNotFound()); +} + +TEST_P(DBCompactionTestWithParam, FullCompactionInBottomPriThreadPool) { + const int kNumFilesTrigger = 3; + Env::Default()->SetBackgroundThreads(1, Env::Priority::BOTTOM); + for (bool use_universal_compaction : {false, true}) { + Options options = CurrentOptions(); + if (use_universal_compaction) { + options.compaction_style = kCompactionStyleUniversal; + } else { + options.compaction_style = kCompactionStyleLevel; + options.level_compaction_dynamic_level_bytes = true; + } + options.num_levels = 4; + options.write_buffer_size = 100 << 10; // 100KB + options.target_file_size_base = 32 << 10; // 32KB + options.level0_file_num_compaction_trigger = kNumFilesTrigger; + // Trigger compaction if size amplification exceeds 110% + options.compaction_options_universal.max_size_amplification_percent = 110; + DestroyAndReopen(options); + + int num_bottom_pri_compactions = 0; + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BGWorkBottomCompaction", + [&](void* /*arg*/) { ++num_bottom_pri_compactions; }); + 
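+    // One thread was reserved in the BOTTOM pool above, so the compaction
+    // that reaches the bottommost level is expected to be re-scheduled to
+    // that pool; the counter above tracks how often that happens.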
SyncPoint::GetInstance()->EnableProcessing(); + + Random rnd(301); + for (int num = 0; num < kNumFilesTrigger; num++) { + ASSERT_EQ(NumSortedRuns(), num); + int key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_EQ(1, num_bottom_pri_compactions); + + // Verify that size amplification did occur + ASSERT_EQ(NumSortedRuns(), 1); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + } + Env::Default()->SetBackgroundThreads(0, Env::Priority::BOTTOM); +} + +TEST_F(DBCompactionTest, OptimizedDeletionObsoleting) { + // Deletions can be dropped when compacted to non-last level if they fall + // outside the lower-level files' key-ranges. + const int kNumL0Files = 4; + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = kNumL0Files; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + DestroyAndReopen(options); + + // put key 1 and 3 in separate L1, L2 files. + // So key 0, 2, and 4+ fall outside these levels' key-ranges. + for (int level = 2; level >= 1; --level) { + for (int i = 0; i < 2; ++i) { + ASSERT_OK(Put(Key(2 * i + 1), "val")); + ASSERT_OK(Flush()); + } + MoveFilesToLevel(level); + ASSERT_EQ(2, NumTableFilesAtLevel(level)); + } + + // Delete keys in range [1, 4]. These L0 files will be compacted with L1: + // - Tombstones for keys 2 and 4 can be dropped early. + // - Tombstones for keys 1 and 3 must be kept due to L2 files' key-ranges. + for (int i = 0; i < kNumL0Files; ++i) { + ASSERT_OK(Put(Key(0), "val")); // sentinel to prevent trivial move + ASSERT_OK(Delete(Key(i + 1))); + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + for (int i = 0; i < kNumL0Files; ++i) { + std::string value; + ASSERT_TRUE(db_->Get(ReadOptions(), Key(i + 1), &value).IsNotFound()); + } + ASSERT_EQ(2, options.statistics->getTickerCount( + COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE)); + ASSERT_EQ(2, + options.statistics->getTickerCount(COMPACTION_KEY_DROP_OBSOLETE)); +} + +TEST_F(DBCompactionTest, CompactFilesPendingL0Bug) { + // https://www.facebook.com/groups/rocksdb.dev/permalink/1389452781153232/ + // CompactFiles() had a bug where it failed to pick a compaction when an L0 + // compaction existed, but marked it as scheduled anyways. It'd never be + // unmarked as scheduled, so future compactions or DB close could hang. + const int kNumL0Files = 5; + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = kNumL0Files - 1; + options.max_background_compactions = 2; + DestroyAndReopen(options); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"LevelCompactionPicker::PickCompaction:Return", + "DBCompactionTest::CompactFilesPendingL0Bug:Picked"}, + {"DBCompactionTest::CompactFilesPendingL0Bug:ManualCompacted", + "DBImpl::BackgroundCompaction:NonTrivial:AfterRun"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + auto schedule_multi_compaction_token = + dbfull()->TEST_write_controler().GetCompactionPressureToken(); + + // Files 0-3 will be included in an L0->L1 compaction. + // + // File 4 will be included in a call to CompactFiles() while the first + // compaction is running. 
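+  //
+  // Before the fix, that CompactFiles() call could leave its input marked as
+  // being compacted even though no compaction was actually picked, so later
+  // automatic compactions and DB close would wait on it forever.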
+ for (int i = 0; i < kNumL0Files - 1; ++i) { + ASSERT_OK(Put(Key(0), "val")); // sentinel to prevent trivial move + ASSERT_OK(Put(Key(i + 1), "val")); + ASSERT_OK(Flush()); + } + TEST_SYNC_POINT("DBCompactionTest::CompactFilesPendingL0Bug:Picked"); + // file 4 flushed after 0-3 picked + ASSERT_OK(Put(Key(kNumL0Files), "val")); + ASSERT_OK(Flush()); + + // previously DB close would hang forever as this situation caused scheduled + // compactions count to never decrement to zero. + ColumnFamilyMetaData cf_meta; + dbfull()->GetColumnFamilyMetaData(dbfull()->DefaultColumnFamily(), &cf_meta); + ASSERT_EQ(kNumL0Files, cf_meta.levels[0].files.size()); + std::vector input_filenames; + input_filenames.push_back(cf_meta.levels[0].files.front().name); + ASSERT_OK(dbfull()->CompactFiles(CompactionOptions(), input_filenames, + 0 /* output_level */)); + TEST_SYNC_POINT("DBCompactionTest::CompactFilesPendingL0Bug:ManualCompacted"); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBCompactionTest, CompactFilesOverlapInL0Bug) { + // Regression test for bug of not pulling in L0 files that overlap the user- + // specified input files in time- and key-ranges. + ASSERT_OK(Put(Key(0), "old_val")); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Key(0), "new_val")); + ASSERT_OK(Flush()); + + ColumnFamilyMetaData cf_meta; + dbfull()->GetColumnFamilyMetaData(dbfull()->DefaultColumnFamily(), &cf_meta); + ASSERT_GE(cf_meta.levels.size(), 2); + ASSERT_EQ(2, cf_meta.levels[0].files.size()); + + // Compacting {new L0 file, L1 file} should pull in the old L0 file since it + // overlaps in key-range and time-range. + std::vector input_filenames; + input_filenames.push_back(cf_meta.levels[0].files.front().name); + ASSERT_OK(dbfull()->CompactFiles(CompactionOptions(), input_filenames, + 1 /* output_level */)); + ASSERT_EQ("new_val", Get(Key(0))); +} + +TEST_F(DBCompactionTest, DeleteFilesInRangeConflictWithCompaction) { + Options options = CurrentOptions(); + DestroyAndReopen(options); + const Snapshot* snapshot = nullptr; + const int kMaxKey = 10; + + for (int i = 0; i < kMaxKey; i++) { + ASSERT_OK(Put(Key(i), Key(i))); + ASSERT_OK(Delete(Key(i))); + if (!snapshot) { + snapshot = db_->GetSnapshot(); + } + } + ASSERT_OK(Flush()); + MoveFilesToLevel(1); + ASSERT_OK(Put(Key(kMaxKey), Key(kMaxKey))); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + // test DeleteFilesInRange() deletes the files already picked for compaction + SyncPoint::GetInstance()->LoadDependency( + {{"VersionSet::LogAndApply:WriteManifestStart", + "BackgroundCallCompaction:0"}, + {"DBImpl::BackgroundCompaction:Finish", + "VersionSet::LogAndApply:WriteManifestDone"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + // release snapshot which mark bottommost file for compaction + db_->ReleaseSnapshot(snapshot); + std::string begin_string = Key(0); + std::string end_string = Key(kMaxKey + 1); + Slice begin(begin_string); + Slice end(end_string); + ASSERT_OK(DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin, &end)); + SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBCompactionTest, CompactBottomLevelFilesWithDeletions) { + // bottom-level files may contain deletions due to snapshots protecting the + // deleted keys. Once the snapshot is released, we should see files with many + // such deletions undergo single-file compactions. 
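+  // Those single-file compactions are expected to be picked with
+  // CompactionReason::kBottommostFiles, which the sync-point callback below
+  // asserts.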
+  const int kNumKeysPerFile = 1024;
+  const int kNumLevelFiles = 4;
+  const int kValueSize = 128;
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.level0_file_num_compaction_trigger = kNumLevelFiles;
+  // inflate it a bit to account for key/metadata overhead
+  options.target_file_size_base = 120 * kNumKeysPerFile * kValueSize / 100;
+  CreateAndReopenWithCF({"one"}, options);
+
+  Random rnd(301);
+  const Snapshot* snapshot = nullptr;
+  for (int i = 0; i < kNumLevelFiles; ++i) {
+    for (int j = 0; j < kNumKeysPerFile; ++j) {
+      ASSERT_OK(
+          Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
+    }
+    if (i == kNumLevelFiles - 1) {
+      snapshot = db_->GetSnapshot();
+      // delete every other key after grabbing a snapshot, so these deletions
+      // and the keys they cover can't be dropped until after the snapshot is
+      // released.
+      for (int j = 0; j < kNumLevelFiles * kNumKeysPerFile; j += 2) {
+        ASSERT_OK(Delete(Key(j)));
+      }
+    }
+    ASSERT_OK(Flush());
+    if (i < kNumLevelFiles - 1) {
+      ASSERT_EQ(i + 1, NumTableFilesAtLevel(0));
+    }
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(kNumLevelFiles, NumTableFilesAtLevel(1));
+
+  std::vector<LiveFileMetaData> pre_release_metadata, post_release_metadata;
+  db_->GetLiveFilesMetaData(&pre_release_metadata);
+  // just need to bump seqnum so ReleaseSnapshot knows the newest key in the SST
+  // files does not need to be preserved in case of a future snapshot.
+  ASSERT_OK(Put(Key(0), "val"));
+  ASSERT_NE(kMaxSequenceNumber, dbfull()->bottommost_files_mark_threshold_);
+  // release snapshot and wait for compactions to finish. Single-file
+  // compactions should be triggered, which reduce the size of each bottom-level
+  // file without changing file count.
+  db_->ReleaseSnapshot(snapshot);
+  ASSERT_EQ(kMaxSequenceNumber, dbfull()->bottommost_files_mark_threshold_);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+        Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+        ASSERT_TRUE(compaction->compaction_reason() ==
+                    CompactionReason::kBottommostFiles);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  db_->GetLiveFilesMetaData(&post_release_metadata);
+  ASSERT_EQ(pre_release_metadata.size(), post_release_metadata.size());
+
+  for (size_t i = 0; i < pre_release_metadata.size(); ++i) {
+    const auto& pre_file = pre_release_metadata[i];
+    const auto& post_file = post_release_metadata[i];
+    ASSERT_EQ(1, pre_file.level);
+    ASSERT_EQ(1, post_file.level);
+    // each file is smaller than it was before as it was rewritten without
+    // deletion markers/deleted keys.
+    ASSERT_LT(post_file.size, pre_file.size);
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, NoCompactBottomLevelFilesWithDeletions) {
+  // bottom-level files may contain deletions due to snapshots protecting the
+  // deleted keys. Once the snapshot is released, we should see files with many
+  // such deletions undergo single-file compactions. But when auto compactions
+  // are disabled, these compactions should not be triggered, since that could
+  // cause too many background jobs.
+ const int kNumKeysPerFile = 1024; + const int kNumLevelFiles = 4; + const int kValueSize = 128; + Options options = CurrentOptions(); + options.compression = kNoCompression; + options.disable_auto_compactions = true; + options.level0_file_num_compaction_trigger = kNumLevelFiles; + // inflate it a bit to account for key/metadata overhead + options.target_file_size_base = 120 * kNumKeysPerFile * kValueSize / 100; + Reopen(options); + + Random rnd(301); + const Snapshot* snapshot = nullptr; + for (int i = 0; i < kNumLevelFiles; ++i) { + for (int j = 0; j < kNumKeysPerFile; ++j) { + ASSERT_OK( + Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize))); + } + if (i == kNumLevelFiles - 1) { + snapshot = db_->GetSnapshot(); + // delete every other key after grabbing a snapshot, so these deletions + // and the keys they cover can't be dropped until after the snapshot is + // released. + for (int j = 0; j < kNumLevelFiles * kNumKeysPerFile; j += 2) { + ASSERT_OK(Delete(Key(j))); + } + } + ASSERT_OK(Flush()); + if (i < kNumLevelFiles - 1) { + ASSERT_EQ(i + 1, NumTableFilesAtLevel(0)); + } + } + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr)); + ASSERT_EQ(kNumLevelFiles, NumTableFilesAtLevel(1)); + + std::vector pre_release_metadata, post_release_metadata; + db_->GetLiveFilesMetaData(&pre_release_metadata); + // just need to bump seqnum so ReleaseSnapshot knows the newest key in the SST + // files does not need to be preserved in case of a future snapshot. + ASSERT_OK(Put(Key(0), "val")); + + // release snapshot and no compaction should be triggered. + std::atomic num_compactions{0}; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:Start", + [&](void* /*arg*/) { num_compactions.fetch_add(1); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + db_->ReleaseSnapshot(snapshot); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(0, num_compactions); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + db_->GetLiveFilesMetaData(&post_release_metadata); + ASSERT_EQ(pre_release_metadata.size(), post_release_metadata.size()); + for (size_t i = 0; i < pre_release_metadata.size(); ++i) { + const auto& pre_file = pre_release_metadata[i]; + const auto& post_file = post_release_metadata[i]; + ASSERT_EQ(1, pre_file.level); + ASSERT_EQ(1, post_file.level); + // each file is same as before with deletion markers/deleted keys. 
+ ASSERT_EQ(post_file.size, pre_file.size); + } +} + +TEST_F(DBCompactionTest, RoundRobinTtlCompactionNormal) { + Options options = CurrentOptions(); + options.compression = kNoCompression; + options.level0_file_num_compaction_trigger = 20; + options.ttl = 24 * 60 * 60; // 24 hours + options.compaction_pri = kRoundRobin; + env_->now_cpu_count_.store(0); + env_->SetMockSleep(); + options.env = env_; + + // add a small second for each wait time, to make sure the file is expired + int small_seconds = 1; + + std::atomic_int ttl_compactions{0}; + std::atomic_int round_robin_ttl_compactions{0}; + std::atomic_int other_compactions{0}; + + SyncPoint::GetInstance()->SetCallBack( + "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { + Compaction* compaction = reinterpret_cast(arg); + auto compaction_reason = compaction->compaction_reason(); + if (compaction_reason == CompactionReason::kTtl) { + ttl_compactions++; + } else if (compaction_reason == CompactionReason::kRoundRobinTtl) { + round_robin_ttl_compactions++; + } else { + other_compactions++; + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + + DestroyAndReopen(options); + + // Setup the files from lower level to up level, each file is 1 hour's older + // than the next one. + // create 10 files on the last level (L6) + for (int i = 0; i < 10; i++) { + for (int j = 0; j < 100; j++) { + ASSERT_OK(Put(Key(i * 100 + j), "value" + std::to_string(i * 100 + j))); + } + ASSERT_OK(Flush()); + env_->MockSleepForSeconds(60 * 60); // generate 1 file per hour + } + MoveFilesToLevel(6); + + // create 5 files on L5 + for (int i = 0; i < 5; i++) { + for (int j = 0; j < 200; j++) { + ASSERT_OK(Put(Key(i * 200 + j), "value" + std::to_string(i * 200 + j))); + } + ASSERT_OK(Flush()); + env_->MockSleepForSeconds(60 * 60); + } + MoveFilesToLevel(5); + + // create 3 files on L4 + for (int i = 0; i < 3; i++) { + for (int j = 0; j < 300; j++) { + ASSERT_OK(Put(Key(i * 300 + j), "value" + std::to_string(i * 300 + j))); + } + ASSERT_OK(Flush()); + env_->MockSleepForSeconds(60 * 60); + } + MoveFilesToLevel(4); + + // The LSM tree should be like: + // L4: [0, 299], [300, 599], [600, 899] + // L5: [0, 199] [200, 399]...............[800, 999] + // L6: [0,99][100,199][200,299][300,399]...............[800,899][900,999] + ASSERT_EQ("0,0,0,0,3,5,10", FilesPerLevel()); + + // make sure the first L5 file is expired + env_->MockSleepForSeconds(16 * 60 * 60 + small_seconds++); + + // trigger TTL compaction + ASSERT_OK(Put(Key(4), "value" + std::to_string(1))); + ASSERT_OK(Put(Key(5), "value" + std::to_string(1))); + ASSERT_OK(Flush()); + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + // verify there's a RoundRobin TTL compaction + ASSERT_EQ(1, round_robin_ttl_compactions); + round_robin_ttl_compactions = 0; + + // expire 2 more files + env_->MockSleepForSeconds(2 * 60 * 60 + small_seconds++); + // trigger TTL compaction + ASSERT_OK(Put(Key(4), "value" + std::to_string(2))); + ASSERT_OK(Put(Key(5), "value" + std::to_string(2))); + ASSERT_OK(Flush()); + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_EQ(2, round_robin_ttl_compactions); + round_robin_ttl_compactions = 0; + + // expire 4 more files, 2 out of 3 files on L4 are expired + env_->MockSleepForSeconds(4 * 60 * 60 + small_seconds++); + // trigger TTL compaction + ASSERT_OK(Put(Key(6), "value" + std::to_string(3))); + ASSERT_OK(Put(Key(7), "value" + std::to_string(3))); + ASSERT_OK(Flush()); + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_EQ(1, NumTableFilesAtLevel(4)); + ASSERT_EQ(0, 
NumTableFilesAtLevel(5)); + + ASSERT_GT(round_robin_ttl_compactions, 0); + round_robin_ttl_compactions = 0; + + // make the first L0 file expired, which triggers a normal TTL compaction + // instead of roundrobin TTL compaction, it will also include an extra file + // from L0 because of overlap + ASSERT_EQ(0, ttl_compactions); + env_->MockSleepForSeconds(19 * 60 * 60 + small_seconds++); + + // trigger TTL compaction + ASSERT_OK(Put(Key(6), "value" + std::to_string(4))); + ASSERT_OK(Put(Key(7), "value" + std::to_string(4))); + ASSERT_OK(Flush()); + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + // L0 -> L1 compaction is normal TTL compaction, L1 -> next levels compactions + // are RoundRobin TTL compaction. + ASSERT_GT(ttl_compactions, 0); + ttl_compactions = 0; + ASSERT_GT(round_robin_ttl_compactions, 0); + round_robin_ttl_compactions = 0; + + // All files are expired, so only the last level has data + env_->MockSleepForSeconds(24 * 60 * 60); + // trigger TTL compaction + ASSERT_OK(Put(Key(6), "value" + std::to_string(4))); + ASSERT_OK(Put(Key(7), "value" + std::to_string(4))); + ASSERT_OK(Flush()); + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel()); + + ASSERT_GT(ttl_compactions, 0); + ttl_compactions = 0; + ASSERT_GT(round_robin_ttl_compactions, 0); + round_robin_ttl_compactions = 0; + + ASSERT_EQ(0, other_compactions); +} + +TEST_F(DBCompactionTest, RoundRobinTtlCompactionUnsortedTime) { + // This is to test the case that the RoundRobin compaction cursor not pointing + // to the oldest file, RoundRobin compaction should still compact the file + // after cursor until all expired files are compacted. + Options options = CurrentOptions(); + options.compression = kNoCompression; + options.level0_file_num_compaction_trigger = 20; + options.ttl = 24 * 60 * 60; // 24 hours + options.compaction_pri = kRoundRobin; + env_->now_cpu_count_.store(0); + env_->SetMockSleep(); + options.env = env_; + + std::atomic_int ttl_compactions{0}; + std::atomic_int round_robin_ttl_compactions{0}; + std::atomic_int other_compactions{0}; + + SyncPoint::GetInstance()->SetCallBack( + "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { + Compaction* compaction = reinterpret_cast(arg); + auto compaction_reason = compaction->compaction_reason(); + if (compaction_reason == CompactionReason::kTtl) { + ttl_compactions++; + } else if (compaction_reason == CompactionReason::kRoundRobinTtl) { + round_robin_ttl_compactions++; + } else { + other_compactions++; + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + + DestroyAndReopen(options); + + // create 10 files on the last level (L6) + for (int i = 0; i < 10; i++) { + for (int j = 0; j < 100; j++) { + ASSERT_OK(Put(Key(i * 100 + j), "value" + std::to_string(i * 100 + j))); + } + ASSERT_OK(Flush()); + env_->MockSleepForSeconds(60 * 60); // generate 1 file per hour + } + MoveFilesToLevel(6); + + // create 5 files on L5 + for (int i = 0; i < 5; i++) { + for (int j = 0; j < 200; j++) { + ASSERT_OK(Put(Key(i * 200 + j), "value" + std::to_string(i * 200 + j))); + } + ASSERT_OK(Flush()); + env_->MockSleepForSeconds(60 * 60); // 1 hour + } + MoveFilesToLevel(5); + + // The LSM tree should be like: + // L5: [0, 199] [200, 399] [400,599] [600,799] [800, 999] + // L6: [0,99][100,199][200,299][300,399]....................[800,899][900,999] + ASSERT_EQ("0,0,0,0,0,5,10", FilesPerLevel()); + + // point the compaction cursor to the 4th file on L5 + VersionSet* const versions = dbfull()->GetVersionSet(); + assert(versions); + 
ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + ASSERT_NE(cfd, nullptr); + Version* const current = cfd->current(); + ASSERT_NE(current, nullptr); + VersionStorageInfo* storage_info = current->storage_info(); + ASSERT_NE(storage_info, nullptr); + const InternalKey split_cursor = InternalKey(Key(600), 100000, kTypeValue); + storage_info->AddCursorForOneLevel(5, split_cursor); + + // make the first file on L5 expired, there should be 3 TTL compactions: + // 4th one, 5th one, then 1st one. + env_->MockSleepForSeconds(19 * 60 * 60 + 1); + // trigger TTL compaction + ASSERT_OK(Put(Key(6), "value" + std::to_string(4))); + ASSERT_OK(Put(Key(7), "value" + std::to_string(4))); + ASSERT_OK(Flush()); + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(2, NumTableFilesAtLevel(5)); + + ASSERT_EQ(3, round_robin_ttl_compactions); + ASSERT_EQ(0, ttl_compactions); + ASSERT_EQ(0, other_compactions); +} + +TEST_F(DBCompactionTest, LevelCompactExpiredTtlFiles) { + const int kNumKeysPerFile = 32; + const int kNumLevelFiles = 2; + const int kValueSize = 1024; + + Options options = CurrentOptions(); + options.compression = kNoCompression; + options.ttl = 24 * 60 * 60; // 24 hours + options.max_open_files = -1; + env_->SetMockSleep(); + options.env = env_; + + // NOTE: Presumed unnecessary and removed: resetting mock time in env + + DestroyAndReopen(options); + + Random rnd(301); + for (int i = 0; i < kNumLevelFiles; ++i) { + for (int j = 0; j < kNumKeysPerFile; ++j) { + ASSERT_OK( + Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize))); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + MoveFilesToLevel(3); + ASSERT_EQ("0,0,0,2", FilesPerLevel()); + + // Delete previously written keys. + for (int i = 0; i < kNumLevelFiles; ++i) { + for (int j = 0; j < kNumKeysPerFile; ++j) { + ASSERT_OK(Delete(Key(i * kNumKeysPerFile + j))); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ("2,0,0,2", FilesPerLevel()); + MoveFilesToLevel(1); + ASSERT_EQ("0,2,0,2", FilesPerLevel()); + + env_->MockSleepForSeconds(36 * 60 * 60); // 36 hours + ASSERT_EQ("0,2,0,2", FilesPerLevel()); + + // Just do a simple write + flush so that the Ttl expired files get + // compacted. + ASSERT_OK(Put("a", "1")); + ASSERT_OK(Flush()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { + Compaction* compaction = reinterpret_cast(arg); + ASSERT_TRUE(compaction->compaction_reason() == CompactionReason::kTtl); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + // All non-L0 files are deleted, as they contained only deleted data. + ASSERT_EQ("1", FilesPerLevel()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + // Test dynamically changing ttl. + + // NOTE: Presumed unnecessary and removed: resetting mock time in env + + DestroyAndReopen(options); + + for (int i = 0; i < kNumLevelFiles; ++i) { + for (int j = 0; j < kNumKeysPerFile; ++j) { + ASSERT_OK( + Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize))); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + MoveFilesToLevel(3); + ASSERT_EQ("0,0,0,2", FilesPerLevel()); + + // Delete previously written keys. 
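+  // Deleting every previously written key leaves files that hold only
+  // tombstones over already-compacted data, so once the lowered ttl kicks in
+  // the compactions below can drop all non-L0 files entirely.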
+  for (int i = 0; i < kNumLevelFiles; ++i) {
+    for (int j = 0; j < kNumKeysPerFile; ++j) {
+      ASSERT_OK(Delete(Key(i * kNumKeysPerFile + j)));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ("2,0,0,2", FilesPerLevel());
+  MoveFilesToLevel(1);
+  ASSERT_EQ("0,2,0,2", FilesPerLevel());
+
+  // Move time forward by 12 hours, and make sure that compaction still doesn't
+  // trigger as ttl is set to 24 hours.
+  env_->MockSleepForSeconds(12 * 60 * 60);
+  ASSERT_OK(Put("a", "1"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ("1,2,0,2", FilesPerLevel());
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+        Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+        ASSERT_TRUE(compaction->compaction_reason() == CompactionReason::kTtl);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Dynamically change ttl to 10 hours.
+  // This should trigger a ttl compaction, as 12 hours have already passed.
+  ASSERT_OK(dbfull()->SetOptions({{"ttl", "36000"}}));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  // All non-L0 files are deleted, as they contained only deleted data.
+  ASSERT_EQ("1", FilesPerLevel());
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, LevelTtlCascadingCompactions) {
+  env_->SetMockSleep();
+  const int kValueSize = 100;
+
+  for (bool if_restart : {false, true}) {
+    for (bool if_open_all_files : {false, true}) {
+      Options options = CurrentOptions();
+      options.compression = kNoCompression;
+      options.ttl = 24 * 60 * 60;  // 24 hours
+      if (if_open_all_files) {
+        options.max_open_files = -1;
+      } else {
+        options.max_open_files = 20;
+      }
+      // RocksDB sanitizes max open files to at least 20. Modify it back.
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+          "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+            int* max_open_files = static_cast<int*>(arg);
+            *max_open_files = 2;
+          });
+      // In the case where all files are opened and a DB restart is done,
+      // force the oldest ancester time in the manifest file to be 0 to
+      // simulate the case of reading from an old version.
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+          "VersionEdit::EncodeTo:VarintOldestAncesterTime", [&](void* arg) {
+            if (if_restart && if_open_all_files) {
+              std::string* encoded_fieled = static_cast<std::string*>(arg);
+              *encoded_fieled = "";
+              PutVarint64(encoded_fieled, 0);
+            }
+          });
+
+      options.env = env_;
+
+      // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+      DestroyAndReopen(options);
+
+      int ttl_compactions = 0;
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+          "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+            Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+            auto compaction_reason = compaction->compaction_reason();
+            if (compaction_reason == CompactionReason::kTtl) {
+              ttl_compactions++;
+            }
+          });
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+      // Add two L6 files with key ranges: [1 .. 100], [101 .. 200].
+      Random rnd(301);
+      for (int i = 1; i <= 100; ++i) {
+        ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize)));
+      }
+      ASSERT_OK(Flush());
+      // Get the first file's creation time. This will be the oldest file in the
+      // DB. Compactions involving this file's descendants should keep getting
+      // this time.
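+      // That time is recorded as oldest_ancester_time in the file metadata and
+      // is carried through to compaction outputs, which the assertion on the
+      // L6 file's metadata below relies on.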
+      std::vector<std::vector<FileMetaData>> level_to_files;
+      dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+                                      &level_to_files);
+      uint64_t oldest_time = level_to_files[0][0].oldest_ancester_time;
+      // Add 1 hour and do another flush.
+      env_->MockSleepForSeconds(1 * 60 * 60);
+      for (int i = 101; i <= 200; ++i) {
+        ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize)));
+      }
+      ASSERT_OK(Flush());
+      MoveFilesToLevel(6);
+      ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
+
+      env_->MockSleepForSeconds(1 * 60 * 60);
+      // Add two L4 files with key ranges: [1 .. 50], [51 .. 150].
+      for (int i = 1; i <= 50; ++i) {
+        ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize)));
+      }
+      ASSERT_OK(Flush());
+      env_->MockSleepForSeconds(1 * 60 * 60);
+      for (int i = 51; i <= 150; ++i) {
+        ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize)));
+      }
+      ASSERT_OK(Flush());
+      MoveFilesToLevel(4);
+      ASSERT_EQ("0,0,0,0,2,0,2", FilesPerLevel());
+
+      env_->MockSleepForSeconds(1 * 60 * 60);
+      // Add one L1 file with key range: [26, 75].
+      for (int i = 26; i <= 75; ++i) {
+        ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize)));
+      }
+      ASSERT_OK(Flush());
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+      MoveFilesToLevel(1);
+      ASSERT_EQ("0,1,0,0,2,0,2", FilesPerLevel());
+
+      // LSM tree:
+      // L1:         [26 .. 75]
+      // L4:     [1 .. 50][51 ..... 150]
+      // L6: [1 ........ 100][101 .... 200]
+      //
+      // On TTL expiry, TTL compaction should be initiated on L1 file, and the
+      // compactions should keep going on until the key range hits bottom level.
+      // In other words: the compaction on this data range "cascades" until
+      // reaching the bottom level.
+      //
+      // Order of events on TTL expiry:
+      // 1. L1 file falls to L3 via 2 trivial moves which are initiated by the
+      //    ttl compaction.
+      // 2. A TTL compaction happens between L3 and L4 files. Output file in L4.
+      // 3. The new output file from L4 falls to L5 via 1 trivial move initiated
+      //    by the ttl compaction.
+      // 4. A TTL compaction happens between L5 and L6 files. Output in L6.
+
+      // Add 25 hours and do a write
+      env_->MockSleepForSeconds(25 * 60 * 60);
+
+      ASSERT_OK(Put(Key(1), "1"));
+      if (if_restart) {
+        Reopen(options);
+      } else {
+        ASSERT_OK(Flush());
+      }
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+      ASSERT_EQ("1,0,0,0,0,0,1", FilesPerLevel());
+      ASSERT_EQ(5, ttl_compactions);
+
+      dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+                                      &level_to_files);
+      ASSERT_EQ(oldest_time, level_to_files[6][0].oldest_ancester_time);
+
+      env_->MockSleepForSeconds(25 * 60 * 60);
+      ASSERT_OK(Put(Key(2), "1"));
+      if (if_restart) {
+        Reopen(options);
+      } else {
+        ASSERT_OK(Flush());
+      }
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+      ASSERT_EQ("1,0,0,0,0,0,1", FilesPerLevel());
+      ASSERT_GE(ttl_compactions, 6);
+
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+    }
+  }
+}
+
+TEST_F(DBCompactionTest, LevelPeriodicCompaction) {
+  env_->SetMockSleep();
+  const int kNumKeysPerFile = 32;
+  const int kNumLevelFiles = 2;
+  const int kValueSize = 100;
+
+  for (bool if_restart : {false, true}) {
+    for (bool if_open_all_files : {false, true}) {
+      Options options = CurrentOptions();
+      options.periodic_compaction_seconds = 48 * 60 * 60;  // 2 days
+      if (if_open_all_files) {
+        options.max_open_files = -1;  // needed for ttl compaction
+      } else {
+        options.max_open_files = 20;
+      }
+      // RocksDB sanitizes max open files to at least 20. Modify it back.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) { + int* max_open_files = static_cast(arg); + *max_open_files = 0; + }); + // In the case where all files are opened and doing DB restart + // forcing the file creation time in manifest file to be 0 to + // simulate the case of reading from an old version. + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "VersionEdit::EncodeTo:VarintFileCreationTime", [&](void* arg) { + if (if_restart && if_open_all_files) { + std::string* encoded_fieled = static_cast(arg); + *encoded_fieled = ""; + PutVarint64(encoded_fieled, 0); + } + }); + + options.env = env_; + + // NOTE: Presumed unnecessary and removed: resetting mock time in env + + DestroyAndReopen(options); + + int periodic_compactions = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { + Compaction* compaction = reinterpret_cast(arg); + auto compaction_reason = compaction->compaction_reason(); + if (compaction_reason == CompactionReason::kPeriodicCompaction) { + periodic_compactions++; + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Random rnd(301); + for (int i = 0; i < kNumLevelFiles; ++i) { + for (int j = 0; j < kNumKeysPerFile; ++j) { + ASSERT_OK( + Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize))); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_EQ("2", FilesPerLevel()); + ASSERT_EQ(0, periodic_compactions); + + // Add 50 hours and do a write + env_->MockSleepForSeconds(50 * 60 * 60); + ASSERT_OK(Put("a", "1")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + // Assert that the files stay in the same level + ASSERT_EQ("3", FilesPerLevel()); + // The two old files go through the periodic compaction process + ASSERT_EQ(2, periodic_compactions); + + MoveFilesToLevel(1); + ASSERT_EQ("0,3", FilesPerLevel()); + + // Add another 50 hours and do another write + env_->MockSleepForSeconds(50 * 60 * 60); + ASSERT_OK(Put("b", "2")); + if (if_restart) { + Reopen(options); + } else { + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ("1,3", FilesPerLevel()); + // The three old files now go through the periodic compaction process. 2 + // + 3. + ASSERT_EQ(5, periodic_compactions); + + // Add another 50 hours and do another write + env_->MockSleepForSeconds(50 * 60 * 60); + ASSERT_OK(Put("c", "3")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ("2,3", FilesPerLevel()); + // The four old files now go through the periodic compaction process. 5 + // + 4. + ASSERT_EQ(9, periodic_compactions); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + } + } +} + +TEST_F(DBCompactionTest, LevelPeriodicCompactionWithOldDB) { + // This test makes sure that periodic compactions are working with a DB + // where file_creation_time of some files is 0. 
+ // After compactions the new files are created with a valid file_creation_time + + const int kNumKeysPerFile = 32; + const int kNumFiles = 4; + const int kValueSize = 100; + + Options options = CurrentOptions(); + env_->SetMockSleep(); + options.env = env_; + + // NOTE: Presumed unnecessary and removed: resetting mock time in env + + DestroyAndReopen(options); + + int periodic_compactions = 0; + bool set_file_creation_time_to_zero = true; + bool set_creation_time_to_zero = true; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { + Compaction* compaction = reinterpret_cast(arg); + auto compaction_reason = compaction->compaction_reason(); + if (compaction_reason == CompactionReason::kPeriodicCompaction) { + periodic_compactions++; + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "PropertyBlockBuilder::AddTableProperty:Start", [&](void* arg) { + TableProperties* props = reinterpret_cast(arg); + if (set_file_creation_time_to_zero) { + props->file_creation_time = 0; + } + if (set_creation_time_to_zero) { + props->creation_time = 0; + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Random rnd(301); + for (int i = 0; i < kNumFiles; ++i) { + for (int j = 0; j < kNumKeysPerFile; ++j) { + ASSERT_OK( + Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize))); + } + ASSERT_OK(Flush()); + // Move the first two files to L2. + if (i == 1) { + MoveFilesToLevel(2); + set_creation_time_to_zero = false; + } + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_EQ("2,0,2", FilesPerLevel()); + ASSERT_EQ(0, periodic_compactions); + + Close(); + + set_file_creation_time_to_zero = false; + // Forward the clock by 2 days. + env_->MockSleepForSeconds(2 * 24 * 60 * 60); + options.periodic_compaction_seconds = 1 * 24 * 60 * 60; // 1 day + + Reopen(options); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ("2,0,2", FilesPerLevel()); + // Make sure that all files go through periodic compaction. 
+ ASSERT_EQ(kNumFiles, periodic_compactions); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBCompactionTest, LevelPeriodicAndTtlCompaction) { + const int kNumKeysPerFile = 32; + const int kNumLevelFiles = 2; + const int kValueSize = 100; + + Options options = CurrentOptions(); + options.ttl = 10 * 60 * 60; // 10 hours + options.periodic_compaction_seconds = 48 * 60 * 60; // 2 days + options.max_open_files = -1; // needed for both periodic and ttl compactions + env_->SetMockSleep(); + options.env = env_; + + // NOTE: Presumed unnecessary and removed: resetting mock time in env + + DestroyAndReopen(options); + + int periodic_compactions = 0; + int ttl_compactions = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { + Compaction* compaction = reinterpret_cast(arg); + auto compaction_reason = compaction->compaction_reason(); + if (compaction_reason == CompactionReason::kPeriodicCompaction) { + periodic_compactions++; + } else if (compaction_reason == CompactionReason::kTtl) { + ttl_compactions++; + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Random rnd(301); + for (int i = 0; i < kNumLevelFiles; ++i) { + for (int j = 0; j < kNumKeysPerFile; ++j) { + ASSERT_OK( + Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize))); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + MoveFilesToLevel(3); + + ASSERT_EQ("0,0,0,2", FilesPerLevel()); + ASSERT_EQ(0, periodic_compactions); + ASSERT_EQ(0, ttl_compactions); + + // Add some time greater than periodic_compaction_time. + env_->MockSleepForSeconds(50 * 60 * 60); + ASSERT_OK(Put("a", "1")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + // Files in the bottom level go through periodic compactions. + ASSERT_EQ("1,0,0,2", FilesPerLevel()); + ASSERT_EQ(2, periodic_compactions); + ASSERT_EQ(0, ttl_compactions); + + // Add a little more time than ttl + env_->MockSleepForSeconds(11 * 60 * 60); + ASSERT_OK(Put("b", "1")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + // Notice that the previous file in level 1 falls down to the bottom level + // due to ttl compactions, one level at a time. + // And bottom level files don't get picked up for ttl compactions. + ASSERT_EQ("1,0,0,3", FilesPerLevel()); + ASSERT_EQ(2, periodic_compactions); + ASSERT_EQ(3, ttl_compactions); + + // Add some time greater than periodic_compaction_time. + env_->MockSleepForSeconds(50 * 60 * 60); + ASSERT_OK(Put("c", "1")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + // Previous L0 file falls one level at a time to bottom level due to ttl. + // And all 4 bottom files go through periodic compactions. 
+ ASSERT_EQ("1,0,0,4", FilesPerLevel()); + ASSERT_EQ(6, periodic_compactions); + ASSERT_EQ(6, ttl_compactions); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBCompactionTest, LevelTtlBooster) { + const int kNumKeysPerFile = 32; + const int kNumLevelFiles = 3; + const int kValueSize = 1000; + + Options options = CurrentOptions(); + options.ttl = 10 * 60 * 60; // 10 hours + options.periodic_compaction_seconds = 480 * 60 * 60; // very long + options.level0_file_num_compaction_trigger = 2; + options.max_bytes_for_level_base = 5 * uint64_t{kNumKeysPerFile * kValueSize}; + options.max_open_files = -1; // needed for both periodic and ttl compactions + options.compaction_pri = CompactionPri::kMinOverlappingRatio; + env_->SetMockSleep(); + options.env = env_; + + // NOTE: Presumed unnecessary and removed: resetting mock time in env + + DestroyAndReopen(options); + + Random rnd(301); + for (int i = 0; i < kNumLevelFiles; ++i) { + for (int j = 0; j < kNumKeysPerFile; ++j) { + ASSERT_OK( + Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize))); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + MoveFilesToLevel(2); + + ASSERT_EQ("0,0,3", FilesPerLevel()); + + // Create some files for L1 + for (int i = 0; i < 2; i++) { + for (int j = 0; j < kNumKeysPerFile; ++j) { + ASSERT_OK(Put(Key(2 * j + i), rnd.RandomString(kValueSize))); + } + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + } + + ASSERT_EQ("0,1,3", FilesPerLevel()); + + // Make the new L0 files qualify TTL boosting and generate one more to trigger + // L1 -> L2 compaction. Old files will be picked even if their priority is + // lower without boosting. + env_->MockSleepForSeconds(8 * 60 * 60); + for (int i = 0; i < 2; i++) { + for (int j = 0; j < kNumKeysPerFile; ++j) { + ASSERT_OK(Put(Key(kNumKeysPerFile * 2 + 2 * j + i), + rnd.RandomString(kValueSize * 2))); + } + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + } + // Force files to be compacted to L1 + ASSERT_OK( + dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "1"}})); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ("0,1,2", FilesPerLevel()); + ASSERT_OK( + dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "2"}})); + + ASSERT_GT(SizeAtLevel(1), kNumKeysPerFile * 4 * kValueSize); +} + +TEST_F(DBCompactionTest, LevelPeriodicCompactionWithCompactionFilters) { + class TestCompactionFilter : public CompactionFilter { + const char* Name() const override { return "TestCompactionFilter"; } + }; + class TestCompactionFilterFactory : public CompactionFilterFactory { + const char* Name() const override { return "TestCompactionFilterFactory"; } + std::unique_ptr CreateCompactionFilter( + const CompactionFilter::Context& /*context*/) override { + return std::unique_ptr(new TestCompactionFilter()); + } + }; + + const int kNumKeysPerFile = 32; + const int kNumLevelFiles = 2; + const int kValueSize = 100; + + Random rnd(301); + + Options options = CurrentOptions(); + TestCompactionFilter test_compaction_filter; + env_->SetMockSleep(); + options.env = env_; + + // NOTE: Presumed unnecessary and removed: resetting mock time in env + + enum CompactionFilterType { + kUseCompactionFilter, + kUseCompactionFilterFactory + }; + + for (CompactionFilterType comp_filter_type : + {kUseCompactionFilter, kUseCompactionFilterFactory}) { + // Assert that periodic compactions are not enabled. 
+ ASSERT_EQ(std::numeric_limits::max() - 1, + options.periodic_compaction_seconds); + + if (comp_filter_type == kUseCompactionFilter) { + options.compaction_filter = &test_compaction_filter; + options.compaction_filter_factory.reset(); + } else if (comp_filter_type == kUseCompactionFilterFactory) { + options.compaction_filter = nullptr; + options.compaction_filter_factory.reset( + new TestCompactionFilterFactory()); + } + DestroyAndReopen(options); + + // periodic_compaction_seconds should be set to the sanitized value when + // a compaction filter or a compaction filter factory is used. + ASSERT_EQ(30 * 24 * 60 * 60, + dbfull()->GetOptions().periodic_compaction_seconds); + + int periodic_compactions = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { + Compaction* compaction = reinterpret_cast(arg); + auto compaction_reason = compaction->compaction_reason(); + if (compaction_reason == CompactionReason::kPeriodicCompaction) { + periodic_compactions++; + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + for (int i = 0; i < kNumLevelFiles; ++i) { + for (int j = 0; j < kNumKeysPerFile; ++j) { + ASSERT_OK( + Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize))); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_EQ("2", FilesPerLevel()); + ASSERT_EQ(0, periodic_compactions); + + // Add 31 days and do a write + env_->MockSleepForSeconds(31 * 24 * 60 * 60); + ASSERT_OK(Put("a", "1")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + // Assert that the files stay in the same level + ASSERT_EQ("3", FilesPerLevel()); + // The two old files go through the periodic compaction process + ASSERT_EQ(2, periodic_compactions); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + } +} + +TEST_F(DBCompactionTest, CompactRangeDelayedByL0FileCount) { + // Verify that, when `CompactRangeOptions::allow_write_stall == false`, manual + // compaction only triggers flush after it's sure stall won't be triggered for + // L0 file count going too high. + const int kNumL0FilesTrigger = 4; + const int kNumL0FilesLimit = 8; + // i == 0: verifies normal case where stall is avoided by delay + // i == 1: verifies no delay in edge case where stall trigger is same as + // compaction trigger, so stall can't be avoided + for (int i = 0; i < 2; ++i) { + Options options = CurrentOptions(); + options.level0_slowdown_writes_trigger = kNumL0FilesLimit; + if (i == 0) { + options.level0_file_num_compaction_trigger = kNumL0FilesTrigger; + } else { + options.level0_file_num_compaction_trigger = kNumL0FilesLimit; + } + Reopen(options); + + if (i == 0) { + // ensure the auto compaction doesn't finish until manual compaction has + // had a chance to be delayed. + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait", + "CompactionJob::Run():End"}}); + } else { + // ensure the auto-compaction doesn't finish until manual compaction has + // continued without delay. 
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:StallWaitDone", + "CompactionJob::Run():End"}}); + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Random rnd(301); + for (int j = 0; j < kNumL0FilesLimit - 1; ++j) { + for (int k = 0; k < 2; ++k) { + ASSERT_OK(Put(Key(k), rnd.RandomString(1024))); + } + ASSERT_OK(Flush()); + } + auto manual_compaction_thread = port::Thread([this]() { + CompactRangeOptions cro; + cro.allow_write_stall = false; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + }); + + manual_compaction_thread.join(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + ASSERT_GT(NumTableFilesAtLevel(1), 0); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + } +} + +TEST_F(DBCompactionTest, CompactRangeDelayedByImmMemTableCount) { + // Verify that, when `CompactRangeOptions::allow_write_stall == false`, manual + // compaction only triggers flush after it's sure stall won't be triggered for + // immutable memtable count going too high. + const int kNumImmMemTableLimit = 8; + // i == 0: verifies normal case where stall is avoided by delay + // i == 1: verifies no delay in edge case where stall trigger is same as flush + // trigger, so stall can't be avoided + for (int i = 0; i < 2; ++i) { + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + // the delay limit is one less than the stop limit. This test focuses on + // avoiding delay limit, but this option sets stop limit, so add one. + options.max_write_buffer_number = kNumImmMemTableLimit + 1; + if (i == 1) { + options.min_write_buffer_number_to_merge = kNumImmMemTableLimit; + } + Reopen(options); + + if (i == 0) { + // ensure the flush doesn't finish until manual compaction has had a + // chance to be delayed. + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait", + "FlushJob::WriteLevel0Table"}}); + } else { + // ensure the flush doesn't finish until manual compaction has continued + // without delay. 
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:StallWaitDone", + "FlushJob::WriteLevel0Table"}}); + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Random rnd(301); + for (int j = 0; j < kNumImmMemTableLimit - 1; ++j) { + ASSERT_OK(Put(Key(0), rnd.RandomString(1024))); + FlushOptions flush_opts; + flush_opts.wait = false; + flush_opts.allow_write_stall = true; + ASSERT_OK(dbfull()->Flush(flush_opts)); + } + + auto manual_compaction_thread = port::Thread([this]() { + CompactRangeOptions cro; + cro.allow_write_stall = false; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + }); + + manual_compaction_thread.join(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + ASSERT_GT(NumTableFilesAtLevel(1), 0); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + } +} + +TEST_F(DBCompactionTest, CompactRangeShutdownWhileDelayed) { + // Verify that, when `CompactRangeOptions::allow_write_stall == false`, delay + // does not hang if CF is dropped or DB is closed + const int kNumL0FilesTrigger = 4; + const int kNumL0FilesLimit = 8; + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = kNumL0FilesTrigger; + options.level0_slowdown_writes_trigger = kNumL0FilesLimit; + // i == 0: DB::DropColumnFamily() on CompactRange's target CF unblocks it + // i == 1: DB::CancelAllBackgroundWork() unblocks CompactRange. This is to + // simulate what happens during Close as we can't call Close (it + // blocks on the auto-compaction, making a cycle). + for (int i = 0; i < 2; ++i) { + CreateAndReopenWithCF({"one"}, options); + // The calls to close CF/DB wait until the manual compaction stalls. + // The auto-compaction waits until the manual compaction finishes to ensure + // the signal comes from closing CF/DB, not from compaction making progress. 
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait", + "DBCompactionTest::CompactRangeShutdownWhileDelayed:PreShutdown"}, + {"DBCompactionTest::CompactRangeShutdownWhileDelayed:PostManual", + "CompactionJob::Run():End"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Random rnd(301); + for (int j = 0; j < kNumL0FilesLimit - 1; ++j) { + for (int k = 0; k < 2; ++k) { + ASSERT_OK(Put(1, Key(k), rnd.RandomString(1024))); + } + ASSERT_OK(Flush(1)); + } + auto manual_compaction_thread = port::Thread([this, i]() { + CompactRangeOptions cro; + cro.allow_write_stall = false; + if (i == 0) { + ASSERT_TRUE(db_->CompactRange(cro, handles_[1], nullptr, nullptr) + .IsColumnFamilyDropped()); + } else { + ASSERT_TRUE(db_->CompactRange(cro, handles_[1], nullptr, nullptr) + .IsShutdownInProgress()); + } + }); + + TEST_SYNC_POINT( + "DBCompactionTest::CompactRangeShutdownWhileDelayed:PreShutdown"); + if (i == 0) { + ASSERT_OK(db_->DropColumnFamily(handles_[1])); + } else { + dbfull()->CancelAllBackgroundWork(false /* wait */); + } + manual_compaction_thread.join(); + TEST_SYNC_POINT( + "DBCompactionTest::CompactRangeShutdownWhileDelayed:PostManual"); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + } +} + +TEST_F(DBCompactionTest, CompactRangeSkipFlushAfterDelay) { + // Verify that, when `CompactRangeOptions::allow_write_stall == false`, + // CompactRange skips its flush if the delay is long enough that the memtables + // existing at the beginning of the call have already been flushed. + const int kNumL0FilesTrigger = 4; + const int kNumL0FilesLimit = 8; + Options options = CurrentOptions(); + options.level0_slowdown_writes_trigger = kNumL0FilesLimit; + options.level0_file_num_compaction_trigger = kNumL0FilesTrigger; + Reopen(options); + + Random rnd(301); + // The manual flush includes the memtable that was active when CompactRange + // began. So it unblocks CompactRange and precludes its flush. Throughout the + // test, stall conditions are upheld via high L0 file count. + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait", + "DBCompactionTest::CompactRangeSkipFlushAfterDelay:PreFlush"}, + {"DBCompactionTest::CompactRangeSkipFlushAfterDelay:PostFlush", + "DBImpl::FlushMemTable:StallWaitDone"}, + {"DBImpl::FlushMemTable:StallWaitDone", "CompactionJob::Run():End"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // used for the delayable flushes + FlushOptions flush_opts; + flush_opts.allow_write_stall = true; + for (int i = 0; i < kNumL0FilesLimit - 1; ++i) { + for (int j = 0; j < 2; ++j) { + ASSERT_OK(Put(Key(j), rnd.RandomString(1024))); + } + ASSERT_OK(dbfull()->Flush(flush_opts)); + } + auto manual_compaction_thread = port::Thread([this]() { + CompactRangeOptions cro; + cro.allow_write_stall = false; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + }); + + TEST_SYNC_POINT("DBCompactionTest::CompactRangeSkipFlushAfterDelay:PreFlush"); + ASSERT_OK(Put(std::to_string(0), rnd.RandomString(1024))); + ASSERT_OK(dbfull()->Flush(flush_opts)); + ASSERT_OK(Put(std::to_string(0), rnd.RandomString(1024))); + TEST_SYNC_POINT( + "DBCompactionTest::CompactRangeSkipFlushAfterDelay:PostFlush"); + manual_compaction_thread.join(); + + // If CompactRange's flush was skipped, the final Put above will still be + // in the active memtable. 
+ std::string num_keys_in_memtable; + ASSERT_TRUE(db_->GetProperty(DB::Properties::kNumEntriesActiveMemTable, + &num_keys_in_memtable)); + ASSERT_EQ(std::to_string(1), num_keys_in_memtable); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBCompactionTest, CompactRangeFlushOverlappingMemtable) { + // Verify memtable only gets flushed if it contains data overlapping the range + // provided to `CompactRange`. Tests all kinds of overlap/non-overlap. + const int kNumEndpointKeys = 5; + std::string keys[kNumEndpointKeys] = {"a", "b", "c", "d", "e"}; + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + Reopen(options); + + // One extra iteration for nullptr, which means left side of interval is + // unbounded. + for (int i = 0; i <= kNumEndpointKeys; ++i) { + Slice begin; + Slice* begin_ptr; + if (i == 0) { + begin_ptr = nullptr; + } else { + begin = keys[i - 1]; + begin_ptr = &begin; + } + // Start at `i` so right endpoint comes after left endpoint. One extra + // iteration for nullptr, which means right side of interval is unbounded. + for (int j = std::max(0, i - 1); j <= kNumEndpointKeys; ++j) { + Slice end; + Slice* end_ptr; + if (j == kNumEndpointKeys) { + end_ptr = nullptr; + } else { + end = keys[j]; + end_ptr = &end; + } + ASSERT_OK(Put("b", "val")); + ASSERT_OK(Put("d", "val")); + CompactRangeOptions compact_range_opts; + ASSERT_OK(db_->CompactRange(compact_range_opts, begin_ptr, end_ptr)); + + uint64_t get_prop_tmp, num_memtable_entries = 0; + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesImmMemTables, + &get_prop_tmp)); + num_memtable_entries += get_prop_tmp; + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, + &get_prop_tmp)); + num_memtable_entries += get_prop_tmp; + if (begin_ptr == nullptr || end_ptr == nullptr || + (i <= 4 && j >= 1 && (begin != "c" || end != "c"))) { + // In this case `CompactRange`'s range overlapped in some way with the + // memtable's range, so flush should've happened. Then "b" and "d" won't + // be in the memtable. 
+ ASSERT_EQ(0, num_memtable_entries); + } else { + ASSERT_EQ(2, num_memtable_entries); + // flush anyways to prepare for next iteration + ASSERT_OK(db_->Flush(FlushOptions())); + } + } + } +} + +TEST_F(DBCompactionTest, CompactionStatsTest) { + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 2; + CompactionStatsCollector* collector = new CompactionStatsCollector(); + options.listeners.emplace_back(collector); + DestroyAndReopen(options); + + for (int i = 0; i < 32; i++) { + for (int j = 0; j < 5000; j++) { + ASSERT_OK(Put(std::to_string(j), std::string(1, 'A'))); + } + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ColumnFamilyHandleImpl* cfh = + static_cast(dbfull()->DefaultColumnFamily()); + ColumnFamilyData* cfd = cfh->cfd(); + + VerifyCompactionStats(*cfd, *collector); +} + +TEST_F(DBCompactionTest, SubcompactionEvent) { + class SubCompactionEventListener : public EventListener { + public: + void OnCompactionBegin(DB* /*db*/, const CompactionJobInfo& ci) override { + InstrumentedMutexLock l(&mutex_); + ASSERT_EQ(running_compactions_.find(ci.job_id), + running_compactions_.end()); + running_compactions_.emplace(ci.job_id, std::unordered_set()); + } + + void OnCompactionCompleted(DB* /*db*/, + const CompactionJobInfo& ci) override { + InstrumentedMutexLock l(&mutex_); + auto it = running_compactions_.find(ci.job_id); + ASSERT_NE(it, running_compactions_.end()); + ASSERT_EQ(it->second.size(), 0); + running_compactions_.erase(it); + } + + void OnSubcompactionBegin(const SubcompactionJobInfo& si) override { + InstrumentedMutexLock l(&mutex_); + auto it = running_compactions_.find(si.job_id); + ASSERT_NE(it, running_compactions_.end()); + auto r = it->second.insert(si.subcompaction_job_id); + ASSERT_TRUE(r.second); // each subcompaction_job_id should be different + total_subcompaction_cnt_++; + } + + void OnSubcompactionCompleted(const SubcompactionJobInfo& si) override { + InstrumentedMutexLock l(&mutex_); + auto it = running_compactions_.find(si.job_id); + ASSERT_NE(it, running_compactions_.end()); + auto r = it->second.erase(si.subcompaction_job_id); + ASSERT_EQ(r, 1); + } + + size_t GetRunningCompactionCount() { + InstrumentedMutexLock l(&mutex_); + return running_compactions_.size(); + } + + size_t GetTotalSubcompactionCount() { + InstrumentedMutexLock l(&mutex_); + return total_subcompaction_cnt_; + } + + private: + InstrumentedMutex mutex_; + std::unordered_map> running_compactions_; + size_t total_subcompaction_cnt_ = 0; + }; + + Options options = CurrentOptions(); + options.target_file_size_base = 1024; + options.level0_file_num_compaction_trigger = 10; + auto* listener = new SubCompactionEventListener(); + options.listeners.emplace_back(listener); + + DestroyAndReopen(options); + + // generate 4 files @ L2 + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 10 + j; + ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id))); + } + ASSERT_OK(Flush()); + } + MoveFilesToLevel(2); + + // generate 2 files @ L1 which overlaps with L2 files + for (int i = 0; i < 2; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 20 + j * 2; + ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id))); + } + ASSERT_OK(Flush()); + } + MoveFilesToLevel(1); + ASSERT_EQ(FilesPerLevel(), "0,2,4"); + + CompactRangeOptions comp_opts; + comp_opts.max_subcompactions = 4; + Status s = dbfull()->CompactRange(comp_opts, nullptr, nullptr); + ASSERT_OK(s); + 
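+  // With two overlapping L1 files, four L2 files, and max_subcompactions set
+  // to 4, the manual compaction can be split, so the listener is expected to
+  // record at least one subcompaction once all background work finishes.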
ASSERT_OK(dbfull()->TEST_WaitForCompact()); + // make sure there's no running compaction + ASSERT_EQ(listener->GetRunningCompactionCount(), 0); + // and sub compaction is triggered + ASSERT_GT(listener->GetTotalSubcompactionCount(), 0); +} + +TEST_F(DBCompactionTest, CompactFilesOutputRangeConflict) { + // LSM setup: + // L1: [ba bz] + // L2: [a b] [c d] + // L3: [a b] [c d] + // + // Thread 1: Thread 2: + // Begin compacting all L2->L3 + // Compact [ba bz] L1->L3 + // End compacting all L2->L3 + // + // The compaction operation in thread 2 should be disallowed because the range + // overlaps with the compaction in thread 1, which also covers that range in + // L3. + Options options = CurrentOptions(); + FlushedFileCollector* collector = new FlushedFileCollector(); + options.listeners.emplace_back(collector); + Reopen(options); + + for (int level = 3; level >= 2; --level) { + ASSERT_OK(Put("a", "val")); + ASSERT_OK(Put("b", "val")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("c", "val")); + ASSERT_OK(Put("d", "val")); + ASSERT_OK(Flush()); + MoveFilesToLevel(level); + } + ASSERT_OK(Put("ba", "val")); + ASSERT_OK(Put("bz", "val")); + ASSERT_OK(Flush()); + MoveFilesToLevel(1); + + SyncPoint::GetInstance()->LoadDependency({ + {"CompactFilesImpl:0", + "DBCompactionTest::CompactFilesOutputRangeConflict:Thread2Begin"}, + {"DBCompactionTest::CompactFilesOutputRangeConflict:Thread2End", + "CompactFilesImpl:1"}, + }); + SyncPoint::GetInstance()->EnableProcessing(); + + auto bg_thread = port::Thread([&]() { + // Thread 1 + std::vector filenames = collector->GetFlushedFiles(); + filenames.pop_back(); + ASSERT_OK(db_->CompactFiles(CompactionOptions(), filenames, + 3 /* output_level */)); + }); + + // Thread 2 + TEST_SYNC_POINT( + "DBCompactionTest::CompactFilesOutputRangeConflict:Thread2Begin"); + std::string filename = collector->GetFlushedFiles().back(); + ASSERT_FALSE( + db_->CompactFiles(CompactionOptions(), {filename}, 3 /* output_level */) + .ok()); + TEST_SYNC_POINT( + "DBCompactionTest::CompactFilesOutputRangeConflict:Thread2End"); + + bg_thread.join(); +} + +TEST_F(DBCompactionTest, CompactionHasEmptyOutput) { + Options options = CurrentOptions(); + SstStatsCollector* collector = new SstStatsCollector(); + options.level0_file_num_compaction_trigger = 2; + options.listeners.emplace_back(collector); + Reopen(options); + + // Make sure the L0 files overlap to prevent trivial move. + ASSERT_OK(Put("a", "val")); + ASSERT_OK(Put("b", "val")); + ASSERT_OK(Flush()); + ASSERT_OK(Delete("a")); + ASSERT_OK(Delete("b")); + ASSERT_OK(Flush()); + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_EQ(NumTableFilesAtLevel(1), 0); + + // Expect one file creation to start for each flush, and zero for compaction + // since no keys are written. 
+ ASSERT_EQ(2, collector->num_ssts_creation_started()); +} + +TEST_F(DBCompactionTest, CompactionLimiter) { + const int kNumKeysPerFile = 10; + const int kMaxBackgroundThreads = 64; + + struct CompactionLimiter { + std::string name; + int limit_tasks; + int max_tasks; + int tasks; + std::shared_ptr limiter; + }; + + std::vector limiter_settings; + limiter_settings.push_back({"limiter_1", 1, 0, 0, nullptr}); + limiter_settings.push_back({"limiter_2", 2, 0, 0, nullptr}); + limiter_settings.push_back({"limiter_3", 3, 0, 0, nullptr}); + + for (auto& ls : limiter_settings) { + ls.limiter.reset(NewConcurrentTaskLimiter(ls.name, ls.limit_tasks)); + } + + std::shared_ptr unique_limiter( + NewConcurrentTaskLimiter("unique_limiter", -1)); + + const char* cf_names[] = {"default", "0", "1", "2", "3", "4", "5", "6", "7", + "8", "9", "a", "b", "c", "d", "e", "f"}; + const unsigned int cf_count = sizeof cf_names / sizeof cf_names[0]; + + std::unordered_map cf_to_limiter; + + Options options = CurrentOptions(); + options.write_buffer_size = 110 * 1024; // 110KB + options.arena_block_size = 4096; + options.num_levels = 3; + options.level0_file_num_compaction_trigger = 4; + options.level0_slowdown_writes_trigger = 64; + options.level0_stop_writes_trigger = 64; + options.max_background_jobs = kMaxBackgroundThreads; // Enough threads + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeysPerFile)); + options.max_write_buffer_number = 10; // Enough memtables + DestroyAndReopen(options); + + std::vector option_vector; + option_vector.reserve(cf_count); + + for (unsigned int cf = 0; cf < cf_count; cf++) { + ColumnFamilyOptions cf_opt(options); + if (cf == 0) { + // "Default" CF does't use compaction limiter + cf_opt.compaction_thread_limiter = nullptr; + } else if (cf == 1) { + // "1" CF uses bypass compaction limiter + unique_limiter->SetMaxOutstandingTask(-1); + cf_opt.compaction_thread_limiter = unique_limiter; + } else { + // Assign limiter by mod + auto& ls = limiter_settings[cf % 3]; + cf_opt.compaction_thread_limiter = ls.limiter; + cf_to_limiter[cf_names[cf]] = &ls; + } + option_vector.emplace_back(DBOptions(options), cf_opt); + } + + for (unsigned int cf = 1; cf < cf_count; cf++) { + CreateColumnFamilies({cf_names[cf]}, option_vector[cf]); + } + + ReopenWithColumnFamilies( + std::vector(cf_names, cf_names + cf_count), option_vector); + + port::Mutex mutex; + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:BeforeCompaction", [&](void* arg) { + const auto& cf_name = static_cast(arg)->GetName(); + auto iter = cf_to_limiter.find(cf_name); + if (iter != cf_to_limiter.end()) { + MutexLock l(&mutex); + ASSERT_GE(iter->second->limit_tasks, ++iter->second->tasks); + iter->second->max_tasks = + std::max(iter->second->max_tasks, iter->second->limit_tasks); + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:AfterCompaction", [&](void* arg) { + const auto& cf_name = static_cast(arg)->GetName(); + auto iter = cf_to_limiter.find(cf_name); + if (iter != cf_to_limiter.end()) { + MutexLock l(&mutex); + ASSERT_GE(--iter->second->tasks, 0); + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // Block all compact threads in thread pool. 
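+  // The LOW-priority pool is first filled with sleeping tasks, so every
+  // compaction scheduled while they sleep stays queued; this lines up one
+  // pending compaction per column family before any limiter slot is consumed.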
+ const size_t kTotalFlushTasks = kMaxBackgroundThreads / 4; + const size_t kTotalCompactTasks = kMaxBackgroundThreads - kTotalFlushTasks; + env_->SetBackgroundThreads((int)kTotalFlushTasks, Env::HIGH); + env_->SetBackgroundThreads((int)kTotalCompactTasks, Env::LOW); + + test::SleepingBackgroundTask sleeping_compact_tasks[kTotalCompactTasks]; + + // Block all compaction threads in thread pool. + for (size_t i = 0; i < kTotalCompactTasks; i++) { + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, + &sleeping_compact_tasks[i], Env::LOW); + sleeping_compact_tasks[i].WaitUntilSleeping(); + } + + int keyIndex = 0; + + for (int n = 0; n < options.level0_file_num_compaction_trigger; n++) { + for (unsigned int cf = 0; cf < cf_count; cf++) { + for (int i = 0; i < kNumKeysPerFile; i++) { + ASSERT_OK(Put(cf, Key(keyIndex++), "")); + } + // put extra key to trigger flush + ASSERT_OK(Put(cf, "", "")); + } + + for (unsigned int cf = 0; cf < cf_count; cf++) { + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf])); + } + } + + // Enough L0 files to trigger compaction + for (unsigned int cf = 0; cf < cf_count; cf++) { + ASSERT_EQ(NumTableFilesAtLevel(0, cf), + options.level0_file_num_compaction_trigger); + } + + // Create more files for one column family, which triggers speed up + // condition, all compactions will be scheduled. + for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) { + for (int i = 0; i < kNumKeysPerFile; i++) { + ASSERT_OK(Put(0, Key(i), "")); + } + // put extra key to trigger flush + ASSERT_OK(Put(0, "", "")); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0])); + ASSERT_EQ(options.level0_file_num_compaction_trigger + num + 1, + NumTableFilesAtLevel(0, 0)); + } + + // All CFs are pending compaction + ASSERT_EQ(cf_count, env_->GetThreadPoolQueueLen(Env::LOW)); + + // Unblock all compaction threads + for (size_t i = 0; i < kTotalCompactTasks; i++) { + sleeping_compact_tasks[i].WakeUp(); + sleeping_compact_tasks[i].WaitUntilDone(); + } + + for (unsigned int cf = 0; cf < cf_count; cf++) { + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf])); + } + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + // Max outstanding compact tasks reached limit + for (auto& ls : limiter_settings) { + ASSERT_EQ(ls.limit_tasks, ls.max_tasks); + ASSERT_EQ(0, ls.limiter->GetOutstandingTask()); + } + + // test manual compaction under a fully throttled limiter + int cf_test = 1; + unique_limiter->SetMaxOutstandingTask(0); + + // flush one more file to cf 1 + for (int i = 0; i < kNumKeysPerFile; i++) { + ASSERT_OK(Put(cf_test, Key(keyIndex++), "")); + } + // put extra key to trigger flush + ASSERT_OK(Put(cf_test, "", "")); + + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf_test])); + ASSERT_EQ(1, NumTableFilesAtLevel(0, cf_test)); + + Compact(cf_test, Key(0), Key(keyIndex)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); +} + +INSTANTIATE_TEST_CASE_P(DBCompactionTestWithParam, DBCompactionTestWithParam, + ::testing::Values(std::make_tuple(1, true), + std::make_tuple(1, false), + std::make_tuple(4, true), + std::make_tuple(4, false))); + +TEST_P(DBCompactionDirectIOTest, DirectIO) { + Options options = CurrentOptions(); + Destroy(options); + options.create_if_missing = true; + options.disable_auto_compactions = true; + options.use_direct_io_for_flush_and_compaction = GetParam(); + options.env = MockEnv::Create(Env::Default()); + Reopen(options); + bool readahead = false; + SyncPoint::GetInstance()->SetCallBack( + 
"CompactionJob::OpenCompactionOutputFile", [&](void* arg) { + bool* use_direct_writes = static_cast(arg); + ASSERT_EQ(*use_direct_writes, + options.use_direct_io_for_flush_and_compaction); + }); + if (options.use_direct_io_for_flush_and_compaction) { + SyncPoint::GetInstance()->SetCallBack( + "SanitizeOptions:direct_io", [&](void* /*arg*/) { readahead = true; }); + } + SyncPoint::GetInstance()->EnableProcessing(); + CreateAndReopenWithCF({"pikachu"}, options); + MakeTables(3, "p", "q", 1); + ASSERT_EQ("1,1,1", FilesPerLevel(1)); + Compact(1, "p", "q"); + ASSERT_EQ(readahead, options.use_direct_reads); + ASSERT_EQ("0,0,1", FilesPerLevel(1)); + Destroy(options); + delete options.env; +} + +INSTANTIATE_TEST_CASE_P(DBCompactionDirectIOTest, DBCompactionDirectIOTest, + testing::Bool()); + +class CompactionPriTest : public DBTestBase, + public testing::WithParamInterface { + public: + CompactionPriTest() + : DBTestBase("compaction_pri_test", /*env_do_fsync=*/true) { + compaction_pri_ = GetParam(); + } + + // Required if inheriting from testing::WithParamInterface<> + static void SetUpTestCase() {} + static void TearDownTestCase() {} + + uint32_t compaction_pri_; +}; + +TEST_P(CompactionPriTest, Test) { + Options options = CurrentOptions(); + options.write_buffer_size = 16 * 1024; + options.compaction_pri = static_cast(compaction_pri_); + options.hard_pending_compaction_bytes_limit = 256 * 1024; + options.max_bytes_for_level_base = 64 * 1024; + options.max_bytes_for_level_multiplier = 4; + options.compression = kNoCompression; + + DestroyAndReopen(options); + + Random rnd(301); + const int kNKeys = 5000; + int keys[kNKeys]; + for (int i = 0; i < kNKeys; i++) { + keys[i] = i; + } + RandomShuffle(std::begin(keys), std::end(keys), rnd.Next()); + + for (int i = 0; i < kNKeys; i++) { + ASSERT_OK(Put(Key(keys[i]), rnd.RandomString(102))); + } + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + for (int i = 0; i < kNKeys; i++) { + ASSERT_NE("NOT_FOUND", Get(Key(i))); + } +} + +INSTANTIATE_TEST_CASE_P( + CompactionPriTest, CompactionPriTest, + ::testing::Values(CompactionPri::kByCompensatedSize, + CompactionPri::kOldestLargestSeqFirst, + CompactionPri::kOldestSmallestSeqFirst, + CompactionPri::kMinOverlappingRatio, + CompactionPri::kRoundRobin)); + +TEST_F(DBCompactionTest, PersistRoundRobinCompactCursor) { + Options options = CurrentOptions(); + options.write_buffer_size = 16 * 1024; + options.max_bytes_for_level_base = 128 * 1024; + options.target_file_size_base = 64 * 1024; + options.level0_file_num_compaction_trigger = 4; + options.compaction_pri = CompactionPri::kRoundRobin; + options.max_bytes_for_level_multiplier = 4; + options.num_levels = 3; + options.compression = kNoCompression; + + DestroyAndReopen(options); + + Random rnd(301); + + // 30 Files in L0 to trigger compactions between L1 and L2 + for (int i = 0; i < 30; i++) { + for (int j = 0; j < 16; j++) { + ASSERT_OK(Put(rnd.RandomString(24), rnd.RandomString(1000))); + } + ASSERT_OK(Flush()); + } + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + VersionSet* const versions = dbfull()->GetVersionSet(); + assert(versions); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + ASSERT_NE(cfd, nullptr); + + Version* const current = cfd->current(); + ASSERT_NE(current, nullptr); + + const VersionStorageInfo* const storage_info = current->storage_info(); + ASSERT_NE(storage_info, nullptr); + + const std::vector compact_cursors = + storage_info->GetCompactCursors(); + + Reopen(options); + + VersionSet* const 
reopened_versions = dbfull()->GetVersionSet(); + assert(reopened_versions); + + ColumnFamilyData* const reopened_cfd = + reopened_versions->GetColumnFamilySet()->GetDefault(); + ASSERT_NE(reopened_cfd, nullptr); + + Version* const reopened_current = reopened_cfd->current(); + ASSERT_NE(reopened_current, nullptr); + + const VersionStorageInfo* const reopened_storage_info = + reopened_current->storage_info(); + ASSERT_NE(reopened_storage_info, nullptr); + + const std::vector reopened_compact_cursors = + reopened_storage_info->GetCompactCursors(); + const auto icmp = reopened_storage_info->InternalComparator(); + ASSERT_EQ(compact_cursors.size(), reopened_compact_cursors.size()); + for (size_t i = 0; i < compact_cursors.size(); i++) { + if (compact_cursors[i].Valid()) { + ASSERT_EQ(0, + icmp->Compare(compact_cursors[i], reopened_compact_cursors[i])); + } else { + ASSERT_TRUE(!reopened_compact_cursors[i].Valid()); + } + } +} + +TEST_P(RoundRobinSubcompactionsAgainstPressureToken, PressureTokenTest) { + const int kKeysPerBuffer = 100; + Options options = CurrentOptions(); + options.num_levels = 4; + options.max_bytes_for_level_multiplier = 2; + options.level0_file_num_compaction_trigger = 4; + options.target_file_size_base = kKeysPerBuffer * 1024; + options.compaction_pri = CompactionPri::kRoundRobin; + options.max_bytes_for_level_base = 8 * kKeysPerBuffer * 1024; + options.disable_auto_compactions = true; + // Setup 7 threads but limited subcompactions so that + // RoundRobin requires extra compactions from reserved threads + options.max_subcompactions = 1; + options.max_background_compactions = 7; + options.max_compaction_bytes = 100000000; + DestroyAndReopen(options); + env_->SetBackgroundThreads(7, Env::LOW); + + Random rnd(301); + const std::vector files_per_level = {0, 15, 25}; + for (int lvl = 2; lvl > 0; lvl--) { + for (int i = 0; i < files_per_level[lvl]; i++) { + for (int j = 0; j < kKeysPerBuffer; j++) { + // Add (lvl-1) to ensure nearly equivallent number of files + // in L2 are overlapped with fils selected to compact from + // L1 + ASSERT_OK(Put(Key(2 * i * kKeysPerBuffer + 2 * j + (lvl - 1)), + rnd.RandomString(1010))); + } + ASSERT_OK(Flush()); + } + MoveFilesToLevel(lvl); + ASSERT_EQ(files_per_level[lvl], NumTableFilesAtLevel(lvl, 0)); + } + // 15 files in L1; 25 files in L2 + + // This is a variable for making sure the following callback is called + // and the assertions in it are indeed excuted. + bool num_planned_subcompactions_verified = false; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::GenSubcompactionBoundaries:0", [&](void* arg) { + uint64_t num_planned_subcompactions = *(static_cast(arg)); + if (grab_pressure_token_) { + // 7 files are selected for round-robin under auto + // compaction. The number of planned subcompaction is restricted by + // the limited number of max_background_compactions + ASSERT_EQ(num_planned_subcompactions, 7); + } else { + ASSERT_EQ(num_planned_subcompactions, 1); + } + num_planned_subcompactions_verified = true; + }); + + // The following 3 dependencies have to be added to ensure the auto + // compaction and the pressure token is correctly enabled. 
Same for + // RoundRobinSubcompactionsUsingResources and + // DBCompactionTest.RoundRobinSubcompactionsShrinkResources + SyncPoint::GetInstance()->LoadDependency( + {{"RoundRobinSubcompactionsAgainstPressureToken:0", + "BackgroundCallCompaction:0"}, + {"CompactionJob::AcquireSubcompactionResources:0", + "RoundRobinSubcompactionsAgainstPressureToken:1"}, + {"RoundRobinSubcompactionsAgainstPressureToken:2", + "CompactionJob::AcquireSubcompactionResources:1"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(dbfull()->EnableAutoCompaction({dbfull()->DefaultColumnFamily()})); + TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstPressureToken:0"); + TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstPressureToken:1"); + std::unique_ptr pressure_token; + if (grab_pressure_token_) { + pressure_token = + dbfull()->TEST_write_controler().GetCompactionPressureToken(); + } + TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstPressureToken:2"); + + ASSERT_OK(dbfull()->WaitForCompact()); + ASSERT_TRUE(num_planned_subcompactions_verified); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +INSTANTIATE_TEST_CASE_P(RoundRobinSubcompactionsAgainstPressureToken, + RoundRobinSubcompactionsAgainstPressureToken, + testing::Bool()); + +TEST_P(RoundRobinSubcompactionsAgainstResources, SubcompactionsUsingResources) { + const int kKeysPerBuffer = 200; + Options options = CurrentOptions(); + options.num_levels = 4; + options.level0_file_num_compaction_trigger = 3; + options.target_file_size_base = kKeysPerBuffer * 1024; + options.compaction_pri = CompactionPri::kRoundRobin; + options.max_bytes_for_level_base = 30 * kKeysPerBuffer * 1024; + options.disable_auto_compactions = true; + options.max_subcompactions = 1; + options.max_background_compactions = max_compaction_limits_; + // Set a large number for max_compaction_bytes so that one round-robin + // compaction is enough to make post-compaction L1 size less than + // the maximum size (this test assumes only one round-robin compaction + // is triggered by kLevelMaxLevelSize) + options.max_compaction_bytes = 100000000; + + DestroyAndReopen(options); + env_->SetBackgroundThreads(total_low_pri_threads_, Env::LOW); + + Random rnd(301); + const std::vector files_per_level = {0, 40, 100}; + for (int lvl = 2; lvl > 0; lvl--) { + for (int i = 0; i < files_per_level[lvl]; i++) { + for (int j = 0; j < kKeysPerBuffer; j++) { + // Add (lvl-1) to ensure nearly equivallent number of files + // in L2 are overlapped with fils selected to compact from + // L1 + ASSERT_OK(Put(Key(2 * i * kKeysPerBuffer + 2 * j + (lvl - 1)), + rnd.RandomString(1010))); + } + ASSERT_OK(Flush()); + } + MoveFilesToLevel(lvl); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(files_per_level[lvl], NumTableFilesAtLevel(lvl, 0)); + } + + // 40 files in L1; 100 files in L2 + // This is a variable for making sure the following callback is called + // and the assertions in it are indeed excuted. + bool num_planned_subcompactions_verified = false; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::GenSubcompactionBoundaries:0", [&](void* arg) { + uint64_t num_planned_subcompactions = *(static_cast(arg)); + // More than 10 files are selected for round-robin under auto + // compaction. 
The number of planned subcompaction is restricted by + // the minimum number between available threads and compaction limits + ASSERT_EQ(num_planned_subcompactions - options.max_subcompactions, + std::min(total_low_pri_threads_, max_compaction_limits_) - 1); + num_planned_subcompactions_verified = true; + }); + SyncPoint::GetInstance()->LoadDependency( + {{"RoundRobinSubcompactionsAgainstResources:0", + "BackgroundCallCompaction:0"}, + {"CompactionJob::AcquireSubcompactionResources:0", + "RoundRobinSubcompactionsAgainstResources:1"}, + {"RoundRobinSubcompactionsAgainstResources:2", + "CompactionJob::AcquireSubcompactionResources:1"}, + {"CompactionJob::ReleaseSubcompactionResources:0", + "RoundRobinSubcompactionsAgainstResources:3"}, + {"RoundRobinSubcompactionsAgainstResources:4", + "CompactionJob::ReleaseSubcompactionResources:1"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(dbfull()->WaitForCompact()); + ASSERT_OK(dbfull()->EnableAutoCompaction({dbfull()->DefaultColumnFamily()})); + TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:0"); + TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:1"); + auto pressure_token = + dbfull()->TEST_write_controler().GetCompactionPressureToken(); + + TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:2"); + TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:3"); + // We can reserve more threads now except one is being used + ASSERT_EQ(total_low_pri_threads_ - 1, + env_->ReserveThreads(total_low_pri_threads_, Env::Priority::LOW)); + ASSERT_EQ( + total_low_pri_threads_ - 1, + env_->ReleaseThreads(total_low_pri_threads_ - 1, Env::Priority::LOW)); + TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:4"); + ASSERT_OK(dbfull()->WaitForCompact()); + ASSERT_TRUE(num_planned_subcompactions_verified); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +INSTANTIATE_TEST_CASE_P(RoundRobinSubcompactionsAgainstResources, + RoundRobinSubcompactionsAgainstResources, + ::testing::Values(std::make_tuple(1, 5), + std::make_tuple(5, 1), + std::make_tuple(10, 5), + std::make_tuple(5, 10), + std::make_tuple(10, 10))); + +TEST_P(DBCompactionTestWithParam, RoundRobinWithoutAdditionalResources) { + const int kKeysPerBuffer = 200; + Options options = CurrentOptions(); + options.num_levels = 4; + options.level0_file_num_compaction_trigger = 3; + options.target_file_size_base = kKeysPerBuffer * 1024; + options.compaction_pri = CompactionPri::kRoundRobin; + options.max_bytes_for_level_base = 30 * kKeysPerBuffer * 1024; + options.disable_auto_compactions = true; + options.max_subcompactions = max_subcompactions_; + options.max_background_compactions = 1; + options.max_compaction_bytes = 100000000; + // Similar experiment setting as above except the max_subcompactions + // is given by max_subcompactions_ (1 or 4), and we fix the + // additional resources as (1, 1) and thus no more extra resources + // can be used + DestroyAndReopen(options); + env_->SetBackgroundThreads(1, Env::LOW); + + Random rnd(301); + const std::vector files_per_level = {0, 33, 100}; + for (int lvl = 2; lvl > 0; lvl--) { + for (int i = 0; i < files_per_level[lvl]; i++) { + for (int j = 0; j < kKeysPerBuffer; j++) { + // Add (lvl-1) to ensure nearly equivallent number of files + // in L2 are overlapped with fils selected to compact from + // L1 + ASSERT_OK(Put(Key(2 * i * kKeysPerBuffer + 2 * j + (lvl - 1)), + rnd.RandomString(1010))); + } + ASSERT_OK(Flush()); + } + MoveFilesToLevel(lvl); + 
ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(files_per_level[lvl], NumTableFilesAtLevel(lvl, 0)); + } + + // 33 files in L1; 100 files in L2 + // This is a variable for making sure the following callback is called + // and the assertions in it are indeed excuted. + bool num_planned_subcompactions_verified = false; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::GenSubcompactionBoundaries:0", [&](void* arg) { + uint64_t num_planned_subcompactions = *(static_cast(arg)); + // At most 4 files are selected for round-robin under auto + // compaction. The number of planned subcompaction is restricted by + // the max_subcompactions since no extra resources can be used + ASSERT_EQ(num_planned_subcompactions, options.max_subcompactions); + num_planned_subcompactions_verified = true; + }); + // No need to setup dependency for pressure token since + // AcquireSubcompactionResources may not be called and it anyway cannot + // reserve any additional resources + SyncPoint::GetInstance()->LoadDependency( + {{"DBCompactionTest::RoundRobinWithoutAdditionalResources:0", + "BackgroundCallCompaction:0"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(dbfull()->WaitForCompact()); + ASSERT_OK(dbfull()->EnableAutoCompaction({dbfull()->DefaultColumnFamily()})); + TEST_SYNC_POINT("DBCompactionTest::RoundRobinWithoutAdditionalResources:0"); + + ASSERT_OK(dbfull()->WaitForCompact()); + ASSERT_TRUE(num_planned_subcompactions_verified); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(DBCompactionTest, RoundRobinCutOutputAtCompactCursor) { + Options options = CurrentOptions(); + options.num_levels = 3; + options.compression = kNoCompression; + options.write_buffer_size = 4 * 1024; + options.max_bytes_for_level_base = 64 * 1024; + options.max_bytes_for_level_multiplier = 4; + options.level0_file_num_compaction_trigger = 4; + options.compaction_pri = CompactionPri::kRoundRobin; + + DestroyAndReopen(options); + + VersionSet* const versions = dbfull()->GetVersionSet(); + assert(versions); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + ASSERT_NE(cfd, nullptr); + + Version* const current = cfd->current(); + ASSERT_NE(current, nullptr); + + VersionStorageInfo* storage_info = current->storage_info(); + ASSERT_NE(storage_info, nullptr); + + const InternalKey split_cursor = InternalKey(Key(600), 100, kTypeValue); + storage_info->AddCursorForOneLevel(2, split_cursor); + + Random rnd(301); + + for (int i = 0; i < 50; i++) { + for (int j = 0; j < 50; j++) { + ASSERT_OK(Put(Key(j * 2 + i * 100), rnd.RandomString(102))); + } + } + // Add more overlapping files (avoid trivial move) to trigger compaction that + // output files in L2. Note that trivial move does not trigger compaction and + // in that case the cursor is not necessarily the boundary of file. 
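+  // The second pass below writes the interleaving odd keys so the new files
+  // overlap the existing ones, forcing real compactions into L2; the check at
+  // the end verifies every L2 output file lies entirely on one side of the
+  // split cursor.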
+ for (int i = 0; i < 50; i++) { + for (int j = 0; j < 50; j++) { + ASSERT_OK(Put(Key(j * 2 + 1 + i * 100), rnd.RandomString(1014))); + } + } + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + std::vector> level_to_files; + dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(), + &level_to_files); + const auto icmp = cfd->current()->storage_info()->InternalComparator(); + // Files in level 2 should be split by the cursor + for (const auto& file : level_to_files[2]) { + ASSERT_TRUE( + icmp->Compare(file.smallest.Encode(), split_cursor.Encode()) >= 0 || + icmp->Compare(file.largest.Encode(), split_cursor.Encode()) < 0); + } +} + +class NoopMergeOperator : public MergeOperator { + public: + NoopMergeOperator() {} + + bool FullMergeV2(const MergeOperationInput& /*merge_in*/, + MergeOperationOutput* merge_out) const override { + std::string val("bar"); + merge_out->new_value = val; + return true; + } + + const char* Name() const override { return "Noop"; } +}; + +TEST_F(DBCompactionTest, PartialManualCompaction) { + Options opts = CurrentOptions(); + opts.num_levels = 3; + opts.level0_file_num_compaction_trigger = 10; + opts.compression = kNoCompression; + opts.merge_operator.reset(new NoopMergeOperator()); + opts.target_file_size_base = 10240; + DestroyAndReopen(opts); + + Random rnd(301); + for (auto i = 0; i < 8; ++i) { + for (auto j = 0; j < 10; ++j) { + ASSERT_OK(Merge("foo", rnd.RandomString(1024))); + } + ASSERT_OK(Flush()); + } + + MoveFilesToLevel(2); + + std::string prop; + EXPECT_TRUE(dbfull()->GetProperty(DB::Properties::kLiveSstFilesSize, &prop)); + uint64_t max_compaction_bytes = atoi(prop.c_str()) / 2; + ASSERT_OK(dbfull()->SetOptions( + {{"max_compaction_bytes", std::to_string(max_compaction_bytes)}})); + + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized; + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); +} + +TEST_F(DBCompactionTest, ManualCompactionFailsInReadOnlyMode) { + // Regression test for bug where manual compaction hangs forever when the DB + // is in read-only mode. Verify it now at least returns, despite failing. + const int kNumL0Files = 4; + std::unique_ptr mock_env( + new FaultInjectionTestEnv(env_)); + Options opts = CurrentOptions(); + opts.disable_auto_compactions = true; + opts.env = mock_env.get(); + DestroyAndReopen(opts); + + Random rnd(301); + for (int i = 0; i < kNumL0Files; ++i) { + // Make sure files are overlapping in key-range to prevent trivial move. + ASSERT_OK(Put("key1", rnd.RandomString(1024))); + ASSERT_OK(Put("key2", rnd.RandomString(1024))); + ASSERT_OK(Flush()); + } + ASSERT_EQ(kNumL0Files, NumTableFilesAtLevel(0)); + + // Enter read-only mode by failing a write. + mock_env->SetFilesystemActive(false); + // Make sure this is outside `CompactRange`'s range so that it doesn't fail + // early trying to flush memtable. + ASSERT_NOK(Put("key3", rnd.RandomString(1024))); + + // In the bug scenario, the first manual compaction would fail and forget to + // unregister itself, causing the second one to hang forever due to conflict + // with a non-running compaction. + CompactRangeOptions cro; + cro.exclusive_manual_compaction = false; + Slice begin_key("key1"); + Slice end_key("key2"); + ASSERT_NOK(dbfull()->CompactRange(cro, &begin_key, &end_key)); + ASSERT_NOK(dbfull()->CompactRange(cro, &begin_key, &end_key)); + + // Close before mock_env destruct. 
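+  // The DB still references the FaultInjectionTestEnv through opts.env, so it
+  // has to be closed before mock_env is destroyed.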
+ Close(); +} + +// ManualCompactionBottomLevelOptimization tests the bottom level manual +// compaction optimization to skip recompacting files created by Ln-1 to Ln +// compaction +TEST_F(DBCompactionTest, ManualCompactionBottomLevelOptimized) { + Options opts = CurrentOptions(); + opts.num_levels = 3; + opts.level0_file_num_compaction_trigger = 5; + opts.compression = kNoCompression; + opts.merge_operator.reset(new NoopMergeOperator()); + opts.target_file_size_base = 1024; + opts.max_bytes_for_level_multiplier = 2; + opts.disable_auto_compactions = true; + DestroyAndReopen(opts); + ColumnFamilyHandleImpl* cfh = + static_cast(dbfull()->DefaultColumnFamily()); + ColumnFamilyData* cfd = cfh->cfd(); + InternalStats* internal_stats_ptr = cfd->internal_stats(); + ASSERT_NE(internal_stats_ptr, nullptr); + + Random rnd(301); + for (auto i = 0; i < 8; ++i) { + for (auto j = 0; j < 10; ++j) { + ASSERT_OK( + Put("foo" + std::to_string(i * 10 + j), rnd.RandomString(1024))); + } + ASSERT_OK(Flush()); + } + + MoveFilesToLevel(2); + + for (auto i = 0; i < 8; ++i) { + for (auto j = 0; j < 10; ++j) { + ASSERT_OK( + Put("bar" + std::to_string(i * 10 + j), rnd.RandomString(1024))); + } + ASSERT_OK(Flush()); + } + const std::vector& comp_stats = + internal_stats_ptr->TEST_GetCompactionStats(); + int num = comp_stats[2].num_input_files_in_output_level; + ASSERT_EQ(num, 0); + + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized; + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + + const std::vector& comp_stats2 = + internal_stats_ptr->TEST_GetCompactionStats(); + num = comp_stats2[2].num_input_files_in_output_level; + ASSERT_EQ(num, 0); +} + +TEST_F(DBCompactionTest, ManualCompactionMax) { + uint64_t l1_avg_size = 0, l2_avg_size = 0; + auto generate_sst_func = [&]() { + Random rnd(301); + for (auto i = 0; i < 100; i++) { + for (auto j = 0; j < 10; j++) { + ASSERT_OK(Put(Key(i * 10 + j), rnd.RandomString(1024))); + } + ASSERT_OK(Flush()); + } + MoveFilesToLevel(2); + + for (auto i = 0; i < 10; i++) { + for (auto j = 0; j < 10; j++) { + ASSERT_OK(Put(Key(i * 100 + j * 10), rnd.RandomString(1024))); + } + ASSERT_OK(Flush()); + } + MoveFilesToLevel(1); + + std::vector> level_to_files; + dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(), + &level_to_files); + + uint64_t total = 0; + for (const auto& file : level_to_files[1]) { + total += file.compensated_file_size; + } + l1_avg_size = total / level_to_files[1].size(); + + total = 0; + for (const auto& file : level_to_files[2]) { + total += file.compensated_file_size; + } + l2_avg_size = total / level_to_files[2].size(); + }; + + std::atomic_int num_compactions(0); + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BGWorkCompaction", [&](void* /*arg*/) { ++num_compactions; }); + SyncPoint::GetInstance()->EnableProcessing(); + + Options opts = CurrentOptions(); + opts.disable_auto_compactions = true; + + // with default setting (1.6G by default), it should cover all files in 1 + // compaction + DestroyAndReopen(opts); + generate_sst_func(); + num_compactions.store(0); + CompactRangeOptions cro; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_TRUE(num_compactions.load() == 1); + + // split the compaction to 5 + int num_split = 5; + DestroyAndReopen(opts); + generate_sst_func(); + uint64_t total_size = (l1_avg_size * 10) + (l2_avg_size * 100); + opts.max_compaction_bytes = total_size / num_split; + opts.target_file_size_base = total_size / num_split; + Reopen(opts); + 
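+  // Each compaction is now capped at roughly total_size / num_split bytes, so
+  // the full manual compaction should be carried out in about num_split
+  // pieces, which the BGWorkCompaction callback counts.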
num_compactions.store(0); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_TRUE(num_compactions.load() == num_split); + + // very small max_compaction_bytes, it should still move forward + opts.max_compaction_bytes = l1_avg_size / 2; + opts.target_file_size_base = l1_avg_size / 2; + DestroyAndReopen(opts); + generate_sst_func(); + num_compactions.store(0); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_TRUE(num_compactions.load() > 10); + + // dynamically set the option + num_split = 2; + opts.max_compaction_bytes = 0; + DestroyAndReopen(opts); + generate_sst_func(); + total_size = (l1_avg_size * 10) + (l2_avg_size * 100); + Status s = db_->SetOptions( + {{"max_compaction_bytes", std::to_string(total_size / num_split)}, + {"target_file_size_base", std::to_string(total_size / num_split)}}); + ASSERT_OK(s); + + num_compactions.store(0); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_TRUE(num_compactions.load() == num_split); +} + +TEST_F(DBCompactionTest, CompactionDuringShutdown) { + Options opts = CurrentOptions(); + opts.level0_file_num_compaction_trigger = 2; + opts.disable_auto_compactions = true; + DestroyAndReopen(opts); + ColumnFamilyHandleImpl* cfh = + static_cast(dbfull()->DefaultColumnFamily()); + ColumnFamilyData* cfd = cfh->cfd(); + InternalStats* internal_stats_ptr = cfd->internal_stats(); + ASSERT_NE(internal_stats_ptr, nullptr); + + Random rnd(301); + for (auto i = 0; i < 2; ++i) { + for (auto j = 0; j < 10; ++j) { + ASSERT_OK( + Put("foo" + std::to_string(i * 10 + j), rnd.RandomString(1024))); + } + ASSERT_OK(Flush()); + } + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial:BeforeRun", + [&](void* /*arg*/) { dbfull()->shutting_down_.store(true); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + Status s = dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_TRUE(s.ok() || s.IsShutdownInProgress()); + ASSERT_OK(dbfull()->error_handler_.GetBGError()); +} + +// FixFileIngestionCompactionDeadlock tests and verifies that compaction and +// file ingestion do not cause deadlock in the event of write stall triggered +// by number of L0 files reaching level0_stop_writes_trigger. +TEST_P(DBCompactionTestWithParam, FixFileIngestionCompactionDeadlock) { + const int kNumKeysPerFile = 100; + // Generate SST files. + Options options = CurrentOptions(); + + // Generate an external SST file containing a single key, i.e. 
99 + std::string sst_files_dir = dbname_ + "/sst_files/"; + ASSERT_OK(DestroyDir(env_, sst_files_dir)); + ASSERT_OK(env_->CreateDir(sst_files_dir)); + SstFileWriter sst_writer(EnvOptions(), options); + const std::string sst_file_path = sst_files_dir + "test.sst"; + ASSERT_OK(sst_writer.Open(sst_file_path)); + ASSERT_OK(sst_writer.Put(Key(kNumKeysPerFile - 1), "value")); + ASSERT_OK(sst_writer.Finish()); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->LoadDependency({ + {"DBImpl::IngestExternalFile:AfterIncIngestFileCounter", + "BackgroundCallCompaction:0"}, + }); + SyncPoint::GetInstance()->EnableProcessing(); + + options.write_buffer_size = 110 << 10; // 110KB + options.level0_file_num_compaction_trigger = + options.level0_stop_writes_trigger; + options.max_subcompactions = max_subcompactions_; + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeysPerFile)); + DestroyAndReopen(options); + Random rnd(301); + + // Generate level0_stop_writes_trigger L0 files to trigger write stop + for (int i = 0; i != options.level0_file_num_compaction_trigger; ++i) { + for (int j = 0; j != kNumKeysPerFile; ++j) { + ASSERT_OK(Put(Key(j), rnd.RandomString(990))); + } + if (i > 0) { + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_EQ(NumTableFilesAtLevel(0 /*level*/, 0 /*cf*/), i); + } + } + // When we reach this point, there will be level0_stop_writes_trigger L0 + // files and one extra key (99) in memory, which overlaps with the external + // SST file. Write stall triggers, and can be cleared only after compaction + // reduces the number of L0 files. + + // Compaction will also be triggered since we have reached the threshold for + // auto compaction. Note that compaction may begin after the following file + // ingestion thread and waits for ingestion to finish. + + // Thread to ingest file with overlapping key range with the current + // memtable. Consequently ingestion will trigger a flush. The flush MUST + // proceed without waiting for the write stall condition to clear, otherwise + // deadlock can happen. 
+ port::Thread ingestion_thr([&]() { + IngestExternalFileOptions ifo; + Status s = db_->IngestExternalFile({sst_file_path}, ifo); + ASSERT_OK(s); + }); + + // More write to trigger write stop + ingestion_thr.join(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + Close(); +} + +TEST_F(DBCompactionTest, ConsistencyFailTest) { + Options options = CurrentOptions(); + options.force_consistency_checks = true; + DestroyAndReopen(options); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "VersionBuilder::CheckConsistency0", [&](void* arg) { + auto p = + reinterpret_cast*>(arg); + // just swap the two FileMetaData so that we hit error + // in CheckConsistency funcion + FileMetaData* temp = *(p->first); + *(p->first) = *(p->second); + *(p->second) = temp; + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + for (int k = 0; k < 2; ++k) { + ASSERT_OK(Put("foo", "bar")); + Status s = Flush(); + if (k < 1) { + ASSERT_OK(s); + } else { + ASSERT_TRUE(s.IsCorruption()); + } + } + + ASSERT_NOK(Put("foo", "bar")); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(DBCompactionTest, ConsistencyFailTest2) { + Options options = CurrentOptions(); + options.force_consistency_checks = true; + options.target_file_size_base = 1000; + options.level0_file_num_compaction_trigger = 2; + BlockBasedTableOptions bbto; + bbto.block_size = 400; // small block size + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyAndReopen(options); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "VersionBuilder::CheckConsistency1", [&](void* arg) { + auto p = + reinterpret_cast*>(arg); + // just swap the two FileMetaData so that we hit error + // in CheckConsistency funcion + FileMetaData* temp = *(p->first); + *(p->first) = *(p->second); + *(p->second) = temp; + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Random rnd(301); + std::string value = rnd.RandomString(1000); + + ASSERT_OK(Put("foo1", value)); + ASSERT_OK(Put("z", "")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo2", value)); + ASSERT_OK(Put("z", "")); + Status s = Flush(); + ASSERT_TRUE(s.ok() || s.IsCorruption()); + + // This probably returns non-OK, but we rely on the next Put() + // to determine the DB is frozen. 
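+  // The swapped FileMetaData from the CheckConsistency1 callback makes the
+  // version inconsistent, so a background error is raised and the DB stops
+  // accepting writes, which the Put() below confirms.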
+ ASSERT_NOK(dbfull()->TEST_WaitForCompact()); + ASSERT_NOK(Put("foo", "bar")); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +void IngestOneKeyValue(DBImpl* db, const std::string& key, + const std::string& value, const Options& options) { + ExternalSstFileInfo info; + std::string f = test::PerThreadDBPath("sst_file" + key); + EnvOptions env; + ROCKSDB_NAMESPACE::SstFileWriter writer(env, options); + auto s = writer.Open(f); + ASSERT_OK(s); + // ASSERT_OK(writer.Put(Key(), "")); + ASSERT_OK(writer.Put(key, value)); + + ASSERT_OK(writer.Finish(&info)); + IngestExternalFileOptions ingest_opt; + + ASSERT_OK(db->IngestExternalFile({info.file_path}, ingest_opt)); +} + +TEST_P(DBCompactionTestWithParam, + FlushAfterIntraL0CompactionCheckConsistencyFail) { + Options options = CurrentOptions(); + options.force_consistency_checks = true; + options.compression = kNoCompression; + options.level0_file_num_compaction_trigger = 5; + options.max_background_compactions = 2; + options.max_subcompactions = max_subcompactions_; + DestroyAndReopen(options); + + const size_t kValueSize = 1 << 20; + Random rnd(301); + std::atomic pick_intra_l0_count(0); + std::string value(rnd.RandomString(kValueSize)); + + // The L0->L1 must be picked before we begin ingesting files to trigger + // intra-L0 compaction, and must not finish until after an intra-L0 + // compaction has been picked. + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"LevelCompactionPicker::PickCompaction:Return", + "DBCompactionTestWithParam::" + "FlushAfterIntraL0CompactionCheckConsistencyFail:L0ToL1Ready"}, + {"LevelCompactionPicker::PickCompactionBySize:0", + "CompactionJob::Run():Start"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "FindIntraL0Compaction", + [&](void* /*arg*/) { pick_intra_l0_count.fetch_add(1); }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // prevents trivial move + for (int i = 0; i < 10; ++i) { + ASSERT_OK(Put(Key(i), "")); // prevents trivial move + } + ASSERT_OK(Flush()); + Compact("", Key(99)); + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + + // Flush 5 L0 sst. + for (int i = 0; i < 5; ++i) { + ASSERT_OK(Put(Key(i + 1), value)); + ASSERT_OK(Flush()); + } + ASSERT_EQ(5, NumTableFilesAtLevel(0)); + + // Put one key, to make smallest log sequence number in this memtable is less + // than sst which would be ingested in next step. + ASSERT_OK(Put(Key(0), "a")); + + ASSERT_EQ(5, NumTableFilesAtLevel(0)); + TEST_SYNC_POINT( + "DBCompactionTestWithParam::" + "FlushAfterIntraL0CompactionCheckConsistencyFail:L0ToL1Ready"); + + // Ingest 5 L0 sst. And this files would trigger PickIntraL0Compaction. + for (int i = 5; i < 10; i++) { + ASSERT_EQ(i, NumTableFilesAtLevel(0)); + IngestOneKeyValue(dbfull(), Key(i), value, options); + } + + // Put one key, to make biggest log sequence number in this memtable is bigger + // than sst which would be ingested in next step. 
+ ASSERT_OK(Put(Key(2), "b")); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + std::vector> level_to_files; + dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(), + &level_to_files); + ASSERT_GT(level_to_files[0].size(), 0); + ASSERT_GT(pick_intra_l0_count.load(), 0); + + ASSERT_OK(Flush()); +} + +TEST_P(DBCompactionTestWithParam, + IntraL0CompactionAfterFlushCheckConsistencyFail) { + Options options = CurrentOptions(); + options.force_consistency_checks = true; + options.compression = kNoCompression; + options.level0_file_num_compaction_trigger = 5; + options.max_background_compactions = 2; + options.max_subcompactions = max_subcompactions_; + options.write_buffer_size = 2 << 20; + options.max_write_buffer_number = 6; + DestroyAndReopen(options); + + const size_t kValueSize = 1 << 20; + Random rnd(301); + std::string value(rnd.RandomString(kValueSize)); + std::string value2(rnd.RandomString(kValueSize)); + std::string bigvalue = value + value; + + // prevents trivial move + for (int i = 0; i < 10; ++i) { + ASSERT_OK(Put(Key(i), "")); // prevents trivial move + } + ASSERT_OK(Flush()); + Compact("", Key(99)); + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + + std::atomic pick_intra_l0_count(0); + // The L0->L1 must be picked before we begin ingesting files to trigger + // intra-L0 compaction, and must not finish until after an intra-L0 + // compaction has been picked. + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"LevelCompactionPicker::PickCompaction:Return", + "DBCompactionTestWithParam::" + "IntraL0CompactionAfterFlushCheckConsistencyFail:L0ToL1Ready"}, + {"LevelCompactionPicker::PickCompactionBySize:0", + "CompactionJob::Run():Start"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "FindIntraL0Compaction", + [&](void* /*arg*/) { pick_intra_l0_count.fetch_add(1); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + // Make 6 L0 sst. 
+ for (int i = 0; i < 6; ++i) { + if (i % 2 == 0) { + IngestOneKeyValue(dbfull(), Key(i), value, options); + } else { + ASSERT_OK(Put(Key(i), value)); + ASSERT_OK(Flush()); + } + } + + ASSERT_EQ(6, NumTableFilesAtLevel(0)); + + // Stop run flush job + env_->SetBackgroundThreads(1, Env::HIGH); + test::SleepingBackgroundTask sleeping_tasks; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_tasks, + Env::Priority::HIGH); + sleeping_tasks.WaitUntilSleeping(); + + // Put many keys to make memtable request to flush + for (int i = 0; i < 6; ++i) { + ASSERT_OK(Put(Key(i), bigvalue)); + } + + ASSERT_EQ(6, NumTableFilesAtLevel(0)); + TEST_SYNC_POINT( + "DBCompactionTestWithParam::" + "IntraL0CompactionAfterFlushCheckConsistencyFail:L0ToL1Ready"); + // ingest file to trigger IntraL0Compaction + for (int i = 6; i < 10; ++i) { + ASSERT_EQ(i, NumTableFilesAtLevel(0)); + IngestOneKeyValue(dbfull(), Key(i), value2, options); + } + + // Wake up flush job + sleeping_tasks.WakeUp(); + sleeping_tasks.WaitUntilDone(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + uint64_t error_count = 0; + db_->GetIntProperty("rocksdb.background-errors", &error_count); + ASSERT_EQ(error_count, 0); + ASSERT_GT(pick_intra_l0_count.load(), 0); + for (int i = 0; i < 6; ++i) { + ASSERT_EQ(bigvalue, Get(Key(i))); + } + for (int i = 6; i < 10; ++i) { + ASSERT_EQ(value2, Get(Key(i))); + } +} + +TEST_P(DBCompactionTestWithBottommostParam, SequenceKeysManualCompaction) { + constexpr int kSstNum = 10; + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + DestroyAndReopen(options); + + // Generate some sst files on level 0 with sequence keys (no overlap) + for (int i = 0; i < kSstNum; i++) { + for (int j = 1; j < UCHAR_MAX; j++) { + auto key = std::string(kSstNum, '\0'); + key[kSstNum - i] += static_cast(j); + ASSERT_OK(Put(key, std::string(i % 1000, 'A'))); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + + ASSERT_EQ(std::to_string(kSstNum), FilesPerLevel(0)); + + auto cro = CompactRangeOptions(); + cro.bottommost_level_compaction = bottommost_level_compaction_; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + if (bottommost_level_compaction_ == BottommostLevelCompaction::kForce || + bottommost_level_compaction_ == + BottommostLevelCompaction::kForceOptimized) { + // Real compaction to compact all sst files from level 0 to 1 file on level + // 1 + ASSERT_EQ("0,1", FilesPerLevel(0)); + } else { + // Just trivial move from level 0 -> 1 + ASSERT_EQ("0," + std::to_string(kSstNum), FilesPerLevel(0)); + } +} + +INSTANTIATE_TEST_CASE_P( + DBCompactionTestWithBottommostParam, DBCompactionTestWithBottommostParam, + ::testing::Values(BottommostLevelCompaction::kSkip, + BottommostLevelCompaction::kIfHaveCompactionFilter, + BottommostLevelCompaction::kForce, + BottommostLevelCompaction::kForceOptimized)); + +TEST_F(DBCompactionTest, UpdateLevelSubCompactionTest) { + Options options = CurrentOptions(); + options.max_subcompactions = 10; + options.target_file_size_base = 1 << 10; // 1KB + DestroyAndReopen(options); + + bool has_compaction = false; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { + Compaction* compaction = reinterpret_cast(arg); + ASSERT_TRUE(compaction->max_subcompactions() == 10); + has_compaction = true; + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + 
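+  // Sanity-check the configured value first; the PickCompaction callback above
+  // then verifies that every compaction picked while the data is loaded
+  // inherits max_subcompactions == 10.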
ASSERT_TRUE(dbfull()->GetDBOptions().max_subcompactions == 10); + // Trigger compaction + for (int i = 0; i < 32; i++) { + for (int j = 0; j < 5000; j++) { + ASSERT_OK(Put(std::to_string(j), std::string(1, 'A'))); + } + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_TRUE(has_compaction); + + has_compaction = false; + ASSERT_OK(dbfull()->SetDBOptions({{"max_subcompactions", "2"}})); + ASSERT_TRUE(dbfull()->GetDBOptions().max_subcompactions == 2); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { + Compaction* compaction = reinterpret_cast(arg); + ASSERT_TRUE(compaction->max_subcompactions() == 2); + has_compaction = true; + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // Trigger compaction + for (int i = 0; i < 32; i++) { + for (int j = 0; j < 5000; j++) { + ASSERT_OK(Put(std::to_string(j), std::string(1, 'A'))); + } + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_TRUE(has_compaction); +} + +TEST_F(DBCompactionTest, UpdateUniversalSubCompactionTest) { + Options options = CurrentOptions(); + options.max_subcompactions = 10; + options.compaction_style = kCompactionStyleUniversal; + options.target_file_size_base = 1 << 10; // 1KB + DestroyAndReopen(options); + + bool has_compaction = false; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) { + Compaction* compaction = reinterpret_cast(arg); + ASSERT_TRUE(compaction->max_subcompactions() == 10); + has_compaction = true; + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // Trigger compaction + for (int i = 0; i < 32; i++) { + for (int j = 0; j < 5000; j++) { + ASSERT_OK(Put(std::to_string(j), std::string(1, 'A'))); + } + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_TRUE(has_compaction); + has_compaction = false; + + ASSERT_OK(dbfull()->SetDBOptions({{"max_subcompactions", "2"}})); + ASSERT_TRUE(dbfull()->GetDBOptions().max_subcompactions == 2); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) { + Compaction* compaction = reinterpret_cast(arg); + ASSERT_TRUE(compaction->max_subcompactions() == 2); + has_compaction = true; + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // Trigger compaction + for (int i = 0; i < 32; i++) { + for (int j = 0; j < 5000; j++) { + ASSERT_OK(Put(std::to_string(j), std::string(1, 'A'))); + } + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_TRUE(has_compaction); +} + +TEST_P(ChangeLevelConflictsWithAuto, TestConflict) { + // A `CompactRange()` may race with an automatic compaction, we'll need + // to make sure it doesn't corrupte the data. 
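+  // Concretely: the ReFitLevel() step of a change_level CompactRange runs
+  // while an automatic compaction writes to the same level; the refit is
+  // expected to return a non-OK status rather than produce overlapping files.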
+ Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 2; + Reopen(options); + + ASSERT_OK(Put("foo", "v1")); + ASSERT_OK(Put("bar", "v1")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + + { + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 2; + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + } + ASSERT_EQ("0,0,1", FilesPerLevel(0)); + + // Run a qury to refitting to level 1 while another thread writing to + // the same level. + SyncPoint::GetInstance()->LoadDependency({ + // The first two dependencies ensure the foreground creates an L0 file + // between the background compaction's L0->L1 and its L1->L2. + { + "DBImpl::CompactRange:BeforeRefit:1", + "AutoCompactionFinished1", + }, + { + "AutoCompactionFinished2", + "DBImpl::CompactRange:BeforeRefit:2", + }, + }); + SyncPoint::GetInstance()->EnableProcessing(); + + std::thread auto_comp([&] { + TEST_SYNC_POINT("AutoCompactionFinished1"); + ASSERT_OK(Put("bar", "v2")); + ASSERT_OK(Put("foo", "v2")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("bar", "v3")); + ASSERT_OK(Put("foo", "v3")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + TEST_SYNC_POINT("AutoCompactionFinished2"); + }); + + { + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = GetParam() ? 1 : 0; + // This should return non-OK, but it's more important for the test to + // make sure that the DB is not corrupted. + ASSERT_NOK(dbfull()->CompactRange(cro, nullptr, nullptr)); + } + auto_comp.join(); + // Refitting didn't happen. + SyncPoint::GetInstance()->DisableProcessing(); + + // Write something to DB just make sure that consistency check didn't + // fail and make the DB readable. +} + +INSTANTIATE_TEST_CASE_P(ChangeLevelConflictsWithAuto, + ChangeLevelConflictsWithAuto, testing::Bool()); + +TEST_F(DBCompactionTest, ChangeLevelCompactRangeConflictsWithManual) { + // A `CompactRange()` with `change_level == true` needs to execute its final + // step, `ReFitLevel()`, in isolation. Previously there was a bug where + // refitting could target the same level as an ongoing manual compaction, + // leading to overlapping files in that level. + // + // This test ensures that case is not possible by verifying any manual + // compaction issued during the `ReFitLevel()` phase fails with + // `Status::Incomplete`. + Options options = CurrentOptions(); + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 3; + Reopen(options); + + // Setup an LSM with three levels populated. + Random rnd(301); + int key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + { + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 2; + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + } + ASSERT_EQ("0,0,2", FilesPerLevel(0)); + + GenerateNewFile(&rnd, &key_idx); + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,1,2", FilesPerLevel(0)); + + // The background thread will refit L2->L1 while the + // foreground thread will try to simultaneously compact L0->L1. + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ + // The first two dependencies ensure the foreground creates an L0 file + // between the background compaction's L0->L1 and its L1->L2. 
+ { + "DBImpl::RunManualCompaction()::1", + "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:" + "PutFG", + }, + { + "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:" + "FlushedFG", + "DBImpl::RunManualCompaction()::2", + }, + // The next two dependencies ensure the foreground invokes + // `CompactRange()` while the background is refitting. The + // foreground's `CompactRange()` is guaranteed to attempt an L0->L1 + // as we set it up with an empty memtable and a new L0 file. + { + "DBImpl::CompactRange:PreRefitLevel", + "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:" + "CompactFG", + }, + { + "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:" + "CompactedFG", + "DBImpl::CompactRange:PostRefitLevel", + }, + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ROCKSDB_NAMESPACE::port::Thread refit_level_thread([&] { + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 1; + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + }); + + TEST_SYNC_POINT( + "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:PutFG"); + // Make sure we have something new to compact in the foreground. + // Note key 1 is carefully chosen as it ensures the file we create here + // overlaps with one of the files being refitted L2->L1 in the background. + // If we chose key 0, the file created here would not overlap. + ASSERT_OK(Put(Key(1), "val")); + ASSERT_OK(Flush()); + TEST_SYNC_POINT( + "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:FlushedFG"); + + TEST_SYNC_POINT( + "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:CompactFG"); + ASSERT_TRUE(dbfull() + ->CompactRange(CompactRangeOptions(), nullptr, nullptr) + .IsIncomplete()); + TEST_SYNC_POINT( + "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:" + "CompactedFG"); + refit_level_thread.join(); +} + +TEST_F(DBCompactionTest, ChangeLevelErrorPathTest) { + // This test is added to ensure that RefitLevel() error paths are clearing + // internal flags and to test that subsequent valid RefitLevel() calls + // succeeds + Options options = CurrentOptions(); + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 3; + Reopen(options); + + ASSERT_EQ("", FilesPerLevel(0)); + + // Setup an LSM with three levels populated. 
+ Random rnd(301); + int key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1", FilesPerLevel(0)); + { + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 2; + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + } + ASSERT_EQ("0,0,2", FilesPerLevel(0)); + + auto start_idx = key_idx; + GenerateNewFile(&rnd, &key_idx); + GenerateNewFile(&rnd, &key_idx); + auto end_idx = key_idx - 1; + ASSERT_EQ("1,1,2", FilesPerLevel(0)); + + // Next two CompactRange() calls are used to test exercise error paths within + // RefitLevel() before triggering a valid RefitLevel() call + + // Trigger a refit to L1 first + { + std::string begin_string = Key(start_idx); + std::string end_string = Key(end_idx); + Slice begin(begin_string); + Slice end(end_string); + + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 1; + ASSERT_OK(dbfull()->CompactRange(cro, &begin, &end)); + } + ASSERT_EQ("0,3,2", FilesPerLevel(0)); + + // Try a refit from L2->L1 - this should fail and exercise error paths in + // RefitLevel() + { + // Select key range that matches the bottom most level (L2) + std::string begin_string = Key(0); + std::string end_string = Key(start_idx - 1); + Slice begin(begin_string); + Slice end(end_string); + + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 1; + ASSERT_NOK(dbfull()->CompactRange(cro, &begin, &end)); + } + ASSERT_EQ("0,3,2", FilesPerLevel(0)); + + // Try a valid Refit request to ensure, the path is still working + { + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 1; + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + } + ASSERT_EQ("0,5", FilesPerLevel(0)); +} + +TEST_F(DBCompactionTest, CompactionWithBlob) { + Options options; + options.env = env_; + options.disable_auto_compactions = true; + + Reopen(options); + + constexpr char first_key[] = "first_key"; + constexpr char second_key[] = "second_key"; + constexpr char first_value[] = "first_value"; + constexpr char second_value[] = "second_value"; + constexpr char third_value[] = "third_value"; + + ASSERT_OK(Put(first_key, first_value)); + ASSERT_OK(Put(second_key, first_value)); + ASSERT_OK(Flush()); + + ASSERT_OK(Put(first_key, second_value)); + ASSERT_OK(Put(second_key, second_value)); + ASSERT_OK(Flush()); + + ASSERT_OK(Put(first_key, third_value)); + ASSERT_OK(Put(second_key, third_value)); + ASSERT_OK(Flush()); + + options.enable_blob_files = true; + + Reopen(options); + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end)); + + ASSERT_EQ(Get(first_key), third_value); + ASSERT_EQ(Get(second_key), third_value); + + VersionSet* const versions = dbfull()->GetVersionSet(); + assert(versions); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + ASSERT_NE(cfd, nullptr); + + Version* const current = cfd->current(); + ASSERT_NE(current, nullptr); + + const VersionStorageInfo* const storage_info = current->storage_info(); + ASSERT_NE(storage_info, nullptr); + + const auto& l1_files = storage_info->LevelFiles(1); + ASSERT_EQ(l1_files.size(), 1); + + const FileMetaData* const table_file = l1_files[0]; + ASSERT_NE(table_file, nullptr); + + const auto& blob_files = storage_info->GetBlobFiles(); + ASSERT_EQ(blob_files.size(), 1); + + const auto& blob_file = blob_files.front(); + ASSERT_NE(blob_file, nullptr); + + ASSERT_EQ(table_file->smallest.user_key(), first_key); + ASSERT_EQ(table_file->largest.user_key(), 
second_key); + ASSERT_EQ(table_file->oldest_blob_file_number, + blob_file->GetBlobFileNumber()); + + ASSERT_EQ(blob_file->GetTotalBlobCount(), 2); + + const InternalStats* const internal_stats = cfd->internal_stats(); + ASSERT_NE(internal_stats, nullptr); + + const auto& compaction_stats = internal_stats->TEST_GetCompactionStats(); + ASSERT_GE(compaction_stats.size(), 2); + ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0); + ASSERT_EQ(compaction_stats[1].bytes_written, table_file->fd.GetFileSize()); + ASSERT_EQ(compaction_stats[1].bytes_written_blob, + blob_file->GetTotalBlobBytes()); + ASSERT_EQ(compaction_stats[1].num_output_files, 1); + ASSERT_EQ(compaction_stats[1].num_output_files_blob, 1); +} + +class DBCompactionTestBlobError + : public DBCompactionTest, + public testing::WithParamInterface { + public: + DBCompactionTestBlobError() : sync_point_(GetParam()) {} + + std::string sync_point_; +}; + +INSTANTIATE_TEST_CASE_P(DBCompactionTestBlobError, DBCompactionTestBlobError, + ::testing::ValuesIn(std::vector{ + "BlobFileBuilder::WriteBlobToFile:AddRecord", + "BlobFileBuilder::WriteBlobToFile:AppendFooter"})); + +TEST_P(DBCompactionTestBlobError, CompactionError) { + Options options; + options.disable_auto_compactions = true; + options.env = env_; + + Reopen(options); + + constexpr char first_key[] = "first_key"; + constexpr char second_key[] = "second_key"; + constexpr char first_value[] = "first_value"; + constexpr char second_value[] = "second_value"; + constexpr char third_value[] = "third_value"; + + ASSERT_OK(Put(first_key, first_value)); + ASSERT_OK(Put(second_key, first_value)); + ASSERT_OK(Flush()); + + ASSERT_OK(Put(first_key, second_value)); + ASSERT_OK(Put(second_key, second_value)); + ASSERT_OK(Flush()); + + ASSERT_OK(Put(first_key, third_value)); + ASSERT_OK(Put(second_key, third_value)); + ASSERT_OK(Flush()); + + options.enable_blob_files = true; + + Reopen(options); + + SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* arg) { + Status* const s = static_cast(arg); + assert(s); + + (*s) = Status::IOError(sync_point_); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), begin, end).IsIOError()); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + VersionSet* const versions = dbfull()->GetVersionSet(); + assert(versions); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + ASSERT_NE(cfd, nullptr); + + Version* const current = cfd->current(); + ASSERT_NE(current, nullptr); + + const VersionStorageInfo* const storage_info = current->storage_info(); + ASSERT_NE(storage_info, nullptr); + + const auto& l1_files = storage_info->LevelFiles(1); + ASSERT_TRUE(l1_files.empty()); + + const auto& blob_files = storage_info->GetBlobFiles(); + ASSERT_TRUE(blob_files.empty()); + + const InternalStats* const internal_stats = cfd->internal_stats(); + ASSERT_NE(internal_stats, nullptr); + + const auto& compaction_stats = internal_stats->TEST_GetCompactionStats(); + ASSERT_GE(compaction_stats.size(), 2); + + if (sync_point_ == "BlobFileBuilder::WriteBlobToFile:AddRecord") { + ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0); + ASSERT_EQ(compaction_stats[1].bytes_written, 0); + ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0); + ASSERT_EQ(compaction_stats[1].num_output_files, 0); + ASSERT_EQ(compaction_stats[1].num_output_files_blob, 0); + } else { + // SST file 
writing succeeded; blob file writing failed (during Finish) + ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0); + ASSERT_GT(compaction_stats[1].bytes_written, 0); + ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0); + ASSERT_EQ(compaction_stats[1].num_output_files, 1); + ASSERT_EQ(compaction_stats[1].num_output_files_blob, 0); + } +} + +class DBCompactionTestBlobGC + : public DBCompactionTest, + public testing::WithParamInterface> { + public: + DBCompactionTestBlobGC() + : blob_gc_age_cutoff_(std::get<0>(GetParam())), + updated_enable_blob_files_(std::get<1>(GetParam())) {} + + double blob_gc_age_cutoff_; + bool updated_enable_blob_files_; +}; + +INSTANTIATE_TEST_CASE_P(DBCompactionTestBlobGC, DBCompactionTestBlobGC, + ::testing::Combine(::testing::Values(0.0, 0.5, 1.0), + ::testing::Bool())); + +TEST_P(DBCompactionTestBlobGC, CompactionWithBlobGCOverrides) { + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.enable_blob_files = true; + options.blob_file_size = 32; // one blob per file + options.enable_blob_garbage_collection = true; + options.blob_garbage_collection_age_cutoff = 0; + + DestroyAndReopen(options); + + for (int i = 0; i < 128; i += 2) { + ASSERT_OK(Put("key" + std::to_string(i), "value" + std::to_string(i))); + ASSERT_OK( + Put("key" + std::to_string(i + 1), "value" + std::to_string(i + 1))); + ASSERT_OK(Flush()); + } + + std::vector original_blob_files = GetBlobFileNumbers(); + ASSERT_EQ(original_blob_files.size(), 128); + + // Note: turning off enable_blob_files before the compaction results in + // garbage collected values getting inlined. + ASSERT_OK(db_->SetOptions({{"enable_blob_files", "false"}})); + + CompactRangeOptions cro; + cro.blob_garbage_collection_policy = BlobGarbageCollectionPolicy::kForce; + cro.blob_garbage_collection_age_cutoff = blob_gc_age_cutoff_; + + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + + // Check that the GC stats are correct + { + VersionSet* const versions = dbfull()->GetVersionSet(); + assert(versions); + assert(versions->GetColumnFamilySet()); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + assert(cfd); + + const InternalStats* const internal_stats = cfd->internal_stats(); + assert(internal_stats); + + const auto& compaction_stats = internal_stats->TEST_GetCompactionStats(); + ASSERT_GE(compaction_stats.size(), 2); + + ASSERT_GE(compaction_stats[1].bytes_read_blob, 0); + ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0); + } + + const size_t cutoff_index = static_cast( + cro.blob_garbage_collection_age_cutoff * original_blob_files.size()); + const size_t expected_num_files = original_blob_files.size() - cutoff_index; + + const std::vector new_blob_files = GetBlobFileNumbers(); + + ASSERT_EQ(new_blob_files.size(), expected_num_files); + + // Original blob files below the cutoff should be gone, original blob files + // at or above the cutoff should be still there + for (size_t i = cutoff_index; i < original_blob_files.size(); ++i) { + ASSERT_EQ(new_blob_files[i - cutoff_index], original_blob_files[i]); + } + + for (size_t i = 0; i < 128; ++i) { + ASSERT_EQ(Get("key" + std::to_string(i)), "value" + std::to_string(i)); + } +} + +TEST_P(DBCompactionTestBlobGC, CompactionWithBlobGC) { + Options options; + options.env = env_; + options.disable_auto_compactions = true; + options.enable_blob_files = true; + options.blob_file_size = 32; // one blob per file + options.enable_blob_garbage_collection = true; + options.blob_garbage_collection_age_cutoff = 
blob_gc_age_cutoff_; + + Reopen(options); + + constexpr char first_key[] = "first_key"; + constexpr char first_value[] = "first_value"; + constexpr char second_key[] = "second_key"; + constexpr char second_value[] = "second_value"; + + ASSERT_OK(Put(first_key, first_value)); + ASSERT_OK(Put(second_key, second_value)); + ASSERT_OK(Flush()); + + constexpr char third_key[] = "third_key"; + constexpr char third_value[] = "third_value"; + constexpr char fourth_key[] = "fourth_key"; + constexpr char fourth_value[] = "fourth_value"; + + ASSERT_OK(Put(third_key, third_value)); + ASSERT_OK(Put(fourth_key, fourth_value)); + ASSERT_OK(Flush()); + + const std::vector original_blob_files = GetBlobFileNumbers(); + + ASSERT_EQ(original_blob_files.size(), 4); + + const size_t cutoff_index = static_cast( + options.blob_garbage_collection_age_cutoff * original_blob_files.size()); + + // Note: turning off enable_blob_files before the compaction results in + // garbage collected values getting inlined. + size_t expected_number_of_files = original_blob_files.size(); + + if (!updated_enable_blob_files_) { + ASSERT_OK(db_->SetOptions({{"enable_blob_files", "false"}})); + + expected_number_of_files -= cutoff_index; + } + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end)); + + ASSERT_EQ(Get(first_key), first_value); + ASSERT_EQ(Get(second_key), second_value); + ASSERT_EQ(Get(third_key), third_value); + ASSERT_EQ(Get(fourth_key), fourth_value); + + const std::vector new_blob_files = GetBlobFileNumbers(); + + ASSERT_EQ(new_blob_files.size(), expected_number_of_files); + + // Original blob files below the cutoff should be gone, original blob files at + // or above the cutoff should be still there + for (size_t i = cutoff_index; i < original_blob_files.size(); ++i) { + ASSERT_EQ(new_blob_files[i - cutoff_index], original_blob_files[i]); + } + + VersionSet* const versions = dbfull()->GetVersionSet(); + assert(versions); + assert(versions->GetColumnFamilySet()); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + assert(cfd); + + const InternalStats* const internal_stats = cfd->internal_stats(); + assert(internal_stats); + + const auto& compaction_stats = internal_stats->TEST_GetCompactionStats(); + ASSERT_GE(compaction_stats.size(), 2); + + if (blob_gc_age_cutoff_ > 0.0) { + ASSERT_GT(compaction_stats[1].bytes_read_blob, 0); + + if (updated_enable_blob_files_) { + // GC relocated some blobs to new blob files + ASSERT_GT(compaction_stats[1].bytes_written_blob, 0); + ASSERT_EQ(compaction_stats[1].bytes_read_blob, + compaction_stats[1].bytes_written_blob); + } else { + // GC moved some blobs back to the LSM, no new blob files + ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0); + } + } else { + ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0); + ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0); + } +} + +TEST_F(DBCompactionTest, CompactionWithBlobGCError_CorruptIndex) { + Options options; + options.env = env_; + options.disable_auto_compactions = true; + options.enable_blob_files = true; + options.enable_blob_garbage_collection = true; + options.blob_garbage_collection_age_cutoff = 1.0; + + Reopen(options); + + constexpr char first_key[] = "first_key"; + constexpr char first_value[] = "first_value"; + ASSERT_OK(Put(first_key, first_value)); + + constexpr char second_key[] = "second_key"; + constexpr char second_value[] = "second_value"; + ASSERT_OK(Put(second_key, second_value)); + + 
ASSERT_OK(Flush()); + + constexpr char third_key[] = "third_key"; + constexpr char third_value[] = "third_value"; + ASSERT_OK(Put(third_key, third_value)); + + constexpr char fourth_key[] = "fourth_key"; + constexpr char fourth_value[] = "fourth_value"; + ASSERT_OK(Put(fourth_key, fourth_value)); + + ASSERT_OK(Flush()); + + SyncPoint::GetInstance()->SetCallBack( + "CompactionIterator::GarbageCollectBlobIfNeeded::TamperWithBlobIndex", + [](void* arg) { + Slice* const blob_index = static_cast(arg); + assert(blob_index); + assert(!blob_index->empty()); + blob_index->remove_prefix(1); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_TRUE( + db_->CompactRange(CompactRangeOptions(), begin, end).IsCorruption()); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(DBCompactionTest, CompactionWithBlobGCError_InlinedTTLIndex) { + constexpr uint64_t min_blob_size = 10; + + Options options; + options.env = env_; + options.disable_auto_compactions = true; + options.enable_blob_files = true; + options.min_blob_size = min_blob_size; + options.enable_blob_garbage_collection = true; + options.blob_garbage_collection_age_cutoff = 1.0; + + Reopen(options); + + constexpr char first_key[] = "first_key"; + constexpr char first_value[] = "first_value"; + ASSERT_OK(Put(first_key, first_value)); + + constexpr char second_key[] = "second_key"; + constexpr char second_value[] = "second_value"; + ASSERT_OK(Put(second_key, second_value)); + + ASSERT_OK(Flush()); + + constexpr char third_key[] = "third_key"; + constexpr char third_value[] = "third_value"; + ASSERT_OK(Put(third_key, third_value)); + + constexpr char fourth_key[] = "fourth_key"; + constexpr char blob[] = "short"; + static_assert(sizeof(short) - 1 < min_blob_size, + "Blob too long to be inlined"); + + // Fake an inlined TTL blob index. + std::string blob_index; + + constexpr uint64_t expiration = 1234567890; + + BlobIndex::EncodeInlinedTTL(&blob_index, expiration, blob); + + WriteBatch batch; + ASSERT_OK( + WriteBatchInternal::PutBlobIndex(&batch, 0, fourth_key, blob_index)); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + ASSERT_OK(Flush()); + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_TRUE( + db_->CompactRange(CompactRangeOptions(), begin, end).IsCorruption()); +} + +TEST_F(DBCompactionTest, CompactionWithBlobGCError_IndexWithInvalidFileNumber) { + Options options; + options.env = env_; + options.disable_auto_compactions = true; + options.enable_blob_files = true; + options.enable_blob_garbage_collection = true; + options.blob_garbage_collection_age_cutoff = 1.0; + + Reopen(options); + + constexpr char first_key[] = "first_key"; + constexpr char first_value[] = "first_value"; + ASSERT_OK(Put(first_key, first_value)); + + constexpr char second_key[] = "second_key"; + constexpr char second_value[] = "second_value"; + ASSERT_OK(Put(second_key, second_value)); + + ASSERT_OK(Flush()); + + constexpr char third_key[] = "third_key"; + constexpr char third_value[] = "third_value"; + ASSERT_OK(Put(third_key, third_value)); + + constexpr char fourth_key[] = "fourth_key"; + + // Fake a blob index referencing a non-existent blob file. 
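+ // (Editorial note: a blob index is the small record kept in the LSM tree in
+ // place of a large value; it records the blob file number plus the offset
+ // and size of the value inside that file. The hand-crafted index below
+ // points at a blob file that was never written, so when garbage collection
+ // tries to read the blob back, the compaction fails with
+ // Status::Corruption, which is what this test asserts.)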
+ std::string blob_index; + + constexpr uint64_t blob_file_number = 1000; + constexpr uint64_t offset = 1234; + constexpr uint64_t size = 5678; + + BlobIndex::EncodeBlob(&blob_index, blob_file_number, offset, size, + kNoCompression); + + WriteBatch batch; + ASSERT_OK( + WriteBatchInternal::PutBlobIndex(&batch, 0, fourth_key, blob_index)); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + ASSERT_OK(Flush()); + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_TRUE( + db_->CompactRange(CompactRangeOptions(), begin, end).IsCorruption()); +} + +TEST_F(DBCompactionTest, CompactionWithChecksumHandoff1) { + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment"); + return; + } + std::shared_ptr fault_fs( + new FaultInjectionTestFS(FileSystem::Default())); + std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 3; + options.env = fault_fs_env.get(); + options.create_if_missing = true; + options.checksum_handoff_file_types.Add(FileType::kTableFile); + Status s; + Reopen(options); + + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s, Status::OK()); + Destroy(options); + Reopen(options); + + // The hash does not match, compaction write fails + // fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash); + // Since the file system returns IOStatus::Corruption, it is an + // unrecoverable error. + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", [&](void*) { + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s.severity(), + ROCKSDB_NAMESPACE::Status::Severity::kUnrecoverableError); + SyncPoint::GetInstance()->DisableProcessing(); + Destroy(options); + Reopen(options); + + // The file system does not support checksum handoff. The check + // will be ignored. + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum); + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s, Status::OK()); + + // Each write will be similated as corrupted. + // Since the file system returns IOStatus::Corruption, it is an + // unrecoverable error. 
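+ // (Editorial note: "checksum handoff" means RocksDB computes a checksum for
+ // each buffer it writes and passes it to the FileSystem along with the
+ // data, so the storage layer can verify the write before acknowledging it.
+ // It is opted into per file type, e.g.
+ //
+ //   options.checksum_handoff_file_types.Add(FileType::kTableFile);
+ //
+ // The FaultInjectionTestFS used in these tests lets the test pick which
+ // checksum function the "file system" expects, or inject corruption, to
+ // exercise how the resulting write errors are mapped to severities.)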
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", + [&](void*) { fault_fs->IngestDataCorruptionBeforeWrite(); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s.severity(), + ROCKSDB_NAMESPACE::Status::Severity::kUnrecoverableError); + SyncPoint::GetInstance()->DisableProcessing(); + + Destroy(options); +} + +TEST_F(DBCompactionTest, CompactionWithChecksumHandoff2) { + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment"); + return; + } + std::shared_ptr fault_fs( + new FaultInjectionTestFS(FileSystem::Default())); + std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 3; + options.env = fault_fs_env.get(); + options.create_if_missing = true; + Status s; + Reopen(options); + + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s, Status::OK()); + Destroy(options); + Reopen(options); + + // options is not set, the checksum handoff will not be triggered + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", [&](void*) { + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s, Status::OK()); + SyncPoint::GetInstance()->DisableProcessing(); + Destroy(options); + Reopen(options); + + // The file system does not support checksum handoff. The check + // will be ignored. 
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum); + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s, Status::OK()); + + // options is not set, the checksum handoff will not be triggered + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", + [&](void*) { fault_fs->IngestDataCorruptionBeforeWrite(); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s, Status::OK()); + + Destroy(options); +} + +TEST_F(DBCompactionTest, CompactionWithChecksumHandoffManifest1) { + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment"); + return; + } + std::shared_ptr fault_fs( + new FaultInjectionTestFS(FileSystem::Default())); + std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 3; + options.env = fault_fs_env.get(); + options.create_if_missing = true; + options.checksum_handoff_file_types.Add(FileType::kDescriptorFile); + Status s; + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + Reopen(options); + + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s, Status::OK()); + Destroy(options); + Reopen(options); + + // The hash does not match, compaction write fails + // fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash); + // Since the file system returns IOStatus::Corruption, it is mapped to + // kFatalError error. 
+ ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", [&](void*) { + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError); + SyncPoint::GetInstance()->DisableProcessing(); + Destroy(options); +} + +TEST_F(DBCompactionTest, CompactionWithChecksumHandoffManifest2) { + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment"); + return; + } + std::shared_ptr fault_fs( + new FaultInjectionTestFS(FileSystem::Default())); + std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 3; + options.env = fault_fs_env.get(); + options.create_if_missing = true; + options.checksum_handoff_file_types.Add(FileType::kDescriptorFile); + Status s; + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum); + Reopen(options); + + // The file system does not support checksum handoff. The check + // will be ignored. + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s, Status::OK()); + + // Each write will be similated as corrupted. + // Since the file system returns IOStatus::Corruption, it is mapped to + // kFatalError error. 
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", + [&](void*) { fault_fs->IngestDataCorruptionBeforeWrite(); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError); + SyncPoint::GetInstance()->DisableProcessing(); + + Destroy(options); +} + +TEST_F(DBCompactionTest, FIFOWarm) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleFIFO; + options.num_levels = 1; + options.max_open_files = -1; + options.level0_file_num_compaction_trigger = 2; + options.create_if_missing = true; + CompactionOptionsFIFO fifo_options; + fifo_options.age_for_warm = 1000; + fifo_options.max_table_files_size = 100000000; + options.compaction_options_fifo = fifo_options; + env_->SetMockSleep(); + Reopen(options); + + int total_warm = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "NewWritableFile::FileOptions.temperature", [&](void* arg) { + Temperature temperature = *(static_cast(arg)); + if (temperature == Temperature::kWarm) { + total_warm++; + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // The file system does not support checksum handoff. The check + // will be ignored. + ASSERT_OK(Put(Key(0), "value1")); + env_->MockSleepForSeconds(800); + ASSERT_OK(Put(Key(2), "value2")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put(Key(0), "value1")); + env_->MockSleepForSeconds(800); + ASSERT_OK(Put(Key(2), "value2")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put(Key(0), "value1")); + env_->MockSleepForSeconds(800); + ASSERT_OK(Put(Key(2), "value2")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_OK(Put(Key(0), "value1")); + env_->MockSleepForSeconds(800); + ASSERT_OK(Put(Key(2), "value2")); + ASSERT_OK(Flush()); + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + ColumnFamilyMetaData metadata; + db_->GetColumnFamilyMetaData(&metadata); + ASSERT_EQ(4, metadata.file_count); + ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature); + ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[1].temperature); + ASSERT_EQ(Temperature::kWarm, metadata.levels[0].files[2].temperature); + ASSERT_EQ(Temperature::kWarm, metadata.levels[0].files[3].temperature); + ASSERT_EQ(2, total_warm); + + Destroy(options); +} + +TEST_F(DBCompactionTest, DisableMultiManualCompaction) { + const int kNumL0Files = 10; + + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = kNumL0Files; + Reopen(options); + + // Generate 2 levels of file to make sure the manual compaction is not skipped + for (int i = 0; i < 10; i++) { + ASSERT_OK(Put(Key(i), "value")); + if (i % 2) { + ASSERT_OK(Flush()); + } + } + MoveFilesToLevel(2); + + for (int i = 0; i < 10; i++) { + ASSERT_OK(Put(Key(i), "value")); + if (i % 2) { + ASSERT_OK(Flush()); + } + } + MoveFilesToLevel(1); + + // Block compaction queue + test::SleepingBackgroundTask sleeping_task_low; + 
env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + + port::Thread compact_thread1([&]() { + CompactRangeOptions cro; + cro.exclusive_manual_compaction = false; + std::string begin_str = Key(0); + std::string end_str = Key(3); + Slice b = begin_str; + Slice e = end_str; + auto s = db_->CompactRange(cro, &b, &e); + ASSERT_TRUE(s.IsIncomplete()); + }); + + port::Thread compact_thread2([&]() { + CompactRangeOptions cro; + cro.exclusive_manual_compaction = false; + std::string begin_str = Key(4); + std::string end_str = Key(7); + Slice b = begin_str; + Slice e = end_str; + auto s = db_->CompactRange(cro, &b, &e); + ASSERT_TRUE(s.IsIncomplete()); + }); + + // Disable manual compaction should cancel both manual compactions and both + // compaction should return incomplete. + db_->DisableManualCompaction(); + + compact_thread1.join(); + compact_thread2.join(); + + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); +} + +TEST_F(DBCompactionTest, DisableJustStartedManualCompaction) { + const int kNumL0Files = 4; + + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = kNumL0Files; + Reopen(options); + + // generate files, but avoid trigger auto compaction + for (int i = 0; i < kNumL0Files / 2; i++) { + ASSERT_OK(Put(Key(1), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + ASSERT_OK(Flush()); + } + + // make sure the manual compaction background is started but not yet set the + // status to in_progress, then cancel the manual compaction, which should not + // result in segfault + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::BGWorkCompaction", + "DBCompactionTest::DisableJustStartedManualCompaction:" + "PreDisableManualCompaction"}, + {"DBImpl::RunManualCompaction:Unscheduled", + "BackgroundCallCompaction:0"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + port::Thread compact_thread([&]() { + CompactRangeOptions cro; + cro.exclusive_manual_compaction = true; + auto s = db_->CompactRange(cro, nullptr, nullptr); + ASSERT_TRUE(s.IsIncomplete()); + }); + TEST_SYNC_POINT( + "DBCompactionTest::DisableJustStartedManualCompaction:" + "PreDisableManualCompaction"); + db_->DisableManualCompaction(); + + compact_thread.join(); +} + +TEST_F(DBCompactionTest, DisableInProgressManualCompaction) { + const int kNumL0Files = 4; + + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = kNumL0Files; + Reopen(options); + + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::BackgroundCompaction:InProgress", + "DBCompactionTest::DisableInProgressManualCompaction:" + "PreDisableManualCompaction"}, + {"DBImpl::RunManualCompaction:Unscheduled", + "CompactionJob::Run():Start"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + // generate files, but avoid trigger auto compaction + for (int i = 0; i < kNumL0Files / 2; i++) { + ASSERT_OK(Put(Key(1), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + ASSERT_OK(Flush()); + } + + port::Thread compact_thread([&]() { + CompactRangeOptions cro; + cro.exclusive_manual_compaction = true; + auto s = db_->CompactRange(cro, nullptr, nullptr); + ASSERT_TRUE(s.IsIncomplete()); + }); + + TEST_SYNC_POINT( + "DBCompactionTest::DisableInProgressManualCompaction:" + "PreDisableManualCompaction"); + db_->DisableManualCompaction(); + + compact_thread.join(); +} + +TEST_F(DBCompactionTest, DisableManualCompactionThreadQueueFull) { + const int kNumL0Files = 4; + + 
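+ // (Editorial sketch, not part of the upstream test: the pattern exercised in
+ // this group of tests is a manual compaction on one thread being cancelled
+ // from another. Roughly, with a plain rocksdb::DB handle `db` assumed for
+ // illustration:
+ //
+ //   // thread 1: a manual compaction that is queued or long-running
+ //   rocksdb::Status s = db->CompactRange(cro, nullptr, nullptr);
+ //   // thread 2: cancel it; the CompactRange() above then returns
+ //   // Status::Incomplete()
+ //   db->DisableManualCompaction();
+ //
+ // This test repeats that exercise while the LOW-priority thread pool is
+ // deliberately blocked, so the manual compaction is still sitting in the
+ // queue when it is cancelled.)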
SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::RunManualCompaction:Scheduled", + "DBCompactionTest::DisableManualCompactionThreadQueueFull:" + "PreDisableManualCompaction"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = kNumL0Files; + Reopen(options); + + // Block compaction queue + test::SleepingBackgroundTask sleeping_task_low; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + + // generate files, but avoid trigger auto compaction + for (int i = 0; i < kNumL0Files / 2; i++) { + ASSERT_OK(Put(Key(1), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + ASSERT_OK(Flush()); + } + + port::Thread compact_thread([&]() { + CompactRangeOptions cro; + cro.exclusive_manual_compaction = true; + auto s = db_->CompactRange(cro, nullptr, nullptr); + ASSERT_TRUE(s.IsIncomplete()); + }); + + TEST_SYNC_POINT( + "DBCompactionTest::DisableManualCompactionThreadQueueFull:" + "PreDisableManualCompaction"); + + // Generate more files to trigger auto compaction which is scheduled after + // manual compaction. Has to generate 4 more files because existing files are + // pending compaction + for (int i = 0; i < kNumL0Files; i++) { + ASSERT_OK(Put(Key(1), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + ASSERT_OK(Flush()); + } + ASSERT_EQ(std::to_string(kNumL0Files + (kNumL0Files / 2)), FilesPerLevel(0)); + + db_->DisableManualCompaction(); + + // CompactRange should return before the compaction has the chance to run + compact_thread.join(); + + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); + ASSERT_EQ("0,1", FilesPerLevel(0)); +} + +TEST_F(DBCompactionTest, DisableManualCompactionThreadQueueFullDBClose) { + const int kNumL0Files = 4; + + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::RunManualCompaction:Scheduled", + "DBCompactionTest::DisableManualCompactionThreadQueueFullDBClose:" + "PreDisableManualCompaction"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = kNumL0Files; + Reopen(options); + + // Block compaction queue + test::SleepingBackgroundTask sleeping_task_low; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + + // generate files, but avoid trigger auto compaction + for (int i = 0; i < kNumL0Files / 2; i++) { + ASSERT_OK(Put(Key(1), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + ASSERT_OK(Flush()); + } + + port::Thread compact_thread([&]() { + CompactRangeOptions cro; + cro.exclusive_manual_compaction = true; + auto s = db_->CompactRange(cro, nullptr, nullptr); + ASSERT_TRUE(s.IsIncomplete()); + }); + + TEST_SYNC_POINT( + "DBCompactionTest::DisableManualCompactionThreadQueueFullDBClose:" + "PreDisableManualCompaction"); + + // Generate more files to trigger auto compaction which is scheduled after + // manual compaction. Has to generate 4 more files because existing files are + // pending compaction + for (int i = 0; i < kNumL0Files; i++) { + ASSERT_OK(Put(Key(1), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + ASSERT_OK(Flush()); + } + ASSERT_EQ(std::to_string(kNumL0Files + (kNumL0Files / 2)), FilesPerLevel(0)); + + db_->DisableManualCompaction(); + + // CompactRange should return before the compaction has the chance to run + compact_thread.join(); + + // Try close DB while manual compaction is canceled but still in the queue. 
+ // And an auto-triggered compaction is also in the queue. + auto s = db_->Close(); + ASSERT_OK(s); + + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); +} + +TEST_F(DBCompactionTest, DBCloseWithManualCompaction) { + const int kNumL0Files = 4; + + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::RunManualCompaction:Scheduled", + "DBCompactionTest::DisableManualCompactionThreadQueueFullDBClose:" + "PreDisableManualCompaction"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = kNumL0Files; + Reopen(options); + + // Block compaction queue + test::SleepingBackgroundTask sleeping_task_low; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + + // generate files, but avoid trigger auto compaction + for (int i = 0; i < kNumL0Files / 2; i++) { + ASSERT_OK(Put(Key(1), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + ASSERT_OK(Flush()); + } + + port::Thread compact_thread([&]() { + CompactRangeOptions cro; + cro.exclusive_manual_compaction = true; + auto s = db_->CompactRange(cro, nullptr, nullptr); + ASSERT_TRUE(s.IsIncomplete()); + }); + + TEST_SYNC_POINT( + "DBCompactionTest::DisableManualCompactionThreadQueueFullDBClose:" + "PreDisableManualCompaction"); + + // Generate more files to trigger auto compaction which is scheduled after + // manual compaction. Has to generate 4 more files because existing files are + // pending compaction + for (int i = 0; i < kNumL0Files; i++) { + ASSERT_OK(Put(Key(1), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + ASSERT_OK(Flush()); + } + ASSERT_EQ(std::to_string(kNumL0Files + (kNumL0Files / 2)), FilesPerLevel(0)); + + // Close DB with manual compaction and auto triggered compaction in the queue. + auto s = db_->Close(); + ASSERT_OK(s); + + // manual compaction thread should return with Incomplete(). + compact_thread.join(); + + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); +} + +TEST_F(DBCompactionTest, + DisableManualCompactionDoesNotWaitForDrainingAutomaticCompaction) { + // When `CompactRangeOptions::exclusive_manual_compaction == true`, we wait + // for automatic compactions to drain before starting the manual compaction. + // This test verifies `DisableManualCompaction()` can cancel such a compaction + // without waiting for the drain to complete. + const int kNumL0Files = 4; + + // Enforces manual compaction enters wait loop due to pending automatic + // compaction. + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::BGWorkCompaction", "DBImpl::RunManualCompaction:NotScheduled"}, + {"DBImpl::RunManualCompaction:WaitScheduled", + "BackgroundCallCompaction:0"}}); + // The automatic compaction will cancel the waiting manual compaction. + // Completing this implies the cancellation did not wait on automatic + // compactions to finish. 
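+ // (Editorial note: as the comment at the top of this test says, with
+ //
+ //   CompactRangeOptions cro;
+ //   cro.exclusive_manual_compaction = true;
+ //
+ // the manual compaction first waits for already-scheduled automatic
+ // compactions to drain. The callback below cancels the manual compaction
+ // from inside that waiting phase, and the test checks that the cancellation
+ // does not itself wait for the automatic work to finish.)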
+ bool callback_completed = false; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", [&](void* /*arg*/) { + db_->DisableManualCompaction(); + callback_completed = true; + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = kNumL0Files; + Reopen(options); + + for (int i = 0; i < kNumL0Files; ++i) { + ASSERT_OK(Put(Key(1), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + ASSERT_OK(Flush()); + } + + CompactRangeOptions cro; + cro.exclusive_manual_compaction = true; + ASSERT_TRUE(db_->CompactRange(cro, nullptr, nullptr).IsIncomplete()); + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_TRUE(callback_completed); +} + +TEST_F(DBCompactionTest, ChangeLevelConflictsWithManual) { + Options options = CurrentOptions(); + options.num_levels = 3; + Reopen(options); + + // Setup an LSM with L2 populated. + Random rnd(301); + ASSERT_OK(Put(Key(0), rnd.RandomString(990))); + ASSERT_OK(Put(Key(1), rnd.RandomString(990))); + { + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 2; + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + } + ASSERT_EQ("0,0,1", FilesPerLevel(0)); + + // The background thread will refit L2->L1 while the foreground thread will + // attempt to run a compaction on new data. The following dependencies + // ensure the background manual compaction's refitting phase disables manual + // compaction immediately before the foreground manual compaction can register + // itself. Manual compaction is kept disabled until the foreground manual + // checks for the failure once. + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ + // Only do Put()s for foreground CompactRange() once the background + // CompactRange() has reached the refitting phase. + { + "DBImpl::CompactRange:BeforeRefit:1", + "DBCompactionTest::ChangeLevelConflictsWithManual:" + "PreForegroundCompactRange", + }, + // Right before we register the manual compaction, proceed with + // the refitting phase so manual compactions are disabled. Stay in + // the refitting phase with manual compactions disabled until it is + // noticed. + { + "DBImpl::RunManualCompaction:0", + "DBImpl::CompactRange:BeforeRefit:2", + }, + { + "DBImpl::CompactRange:PreRefitLevel", + "DBImpl::RunManualCompaction:1", + }, + { + "DBImpl::RunManualCompaction:PausedAtStart", + "DBImpl::CompactRange:PostRefitLevel", + }, + // If compaction somehow were scheduled, let's let it run after reenabling + // manual compactions. This dependency is not expected to be hit but is + // here for speculatively coercing future bugs. 
+ { + "DBImpl::CompactRange:PostRefitLevel:ManualCompactionEnabled", + "BackgroundCallCompaction:0", + }, + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ROCKSDB_NAMESPACE::port::Thread refit_level_thread([&] { + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 1; + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + }); + + TEST_SYNC_POINT( + "DBCompactionTest::ChangeLevelConflictsWithManual:" + "PreForegroundCompactRange"); + ASSERT_OK(Put(Key(0), rnd.RandomString(990))); + ASSERT_OK(Put(Key(1), rnd.RandomString(990))); + ASSERT_TRUE(dbfull() + ->CompactRange(CompactRangeOptions(), nullptr, nullptr) + .IsIncomplete()); + + refit_level_thread.join(); +} + +TEST_F(DBCompactionTest, BottomPriCompactionCountsTowardConcurrencyLimit) { + // Flushes several files to trigger compaction while lock is released during + // a bottom-pri compaction. Verifies it does not get scheduled to thread pool + // because per-DB limit for compaction parallelism is one (default). + const int kNumL0Files = 4; + const int kNumLevels = 3; + + env_->SetBackgroundThreads(1, Env::Priority::BOTTOM); + + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = kNumL0Files; + options.num_levels = kNumLevels; + DestroyAndReopen(options); + + // Setup last level to be non-empty since it's a bit unclear whether + // compaction to an empty level would be considered "bottommost". + ASSERT_OK(Put(Key(0), "val")); + ASSERT_OK(Flush()); + MoveFilesToLevel(kNumLevels - 1); + + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::BGWorkBottomCompaction", + "DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:" + "PreTriggerCompaction"}, + {"DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:" + "PostTriggerCompaction", + "BackgroundCallCompaction:0"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + port::Thread compact_range_thread([&] { + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + cro.exclusive_manual_compaction = false; + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + }); + + // Sleep in the low-pri thread so any newly scheduled compaction will be + // queued. Otherwise it might finish before we check its existence. + test::SleepingBackgroundTask sleeping_task_low; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + + TEST_SYNC_POINT( + "DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:" + "PreTriggerCompaction"); + for (int i = 0; i < kNumL0Files; ++i) { + ASSERT_OK(Put(Key(0), "val")); + ASSERT_OK(Flush()); + } + ASSERT_EQ(0u, env_->GetThreadPoolQueueLen(Env::Priority::LOW)); + TEST_SYNC_POINT( + "DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:" + "PostTriggerCompaction"); + + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); + compact_range_thread.join(); +} + +TEST_F(DBCompactionTest, BottommostFileCompactionAllowIngestBehind) { + // allow_ingest_behind prevents seqnum zeroing, and could cause + // compaction loop with reason kBottommostFiles. 
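+ // (Editorial sketch, not part of the upstream test: allow_ingest_behind
+ // reserves the bottommost level for SST files ingested "behind" the
+ // existing data, e.g., assuming a pre-built file at a hypothetical path:
+ //
+ //   rocksdb::IngestExternalFileOptions ifo;
+ //   ifo.ingest_behind = true;
+ //   rocksdb::Status s =
+ //       db->IngestExternalFile({"/path/to/prebuilt.sst"}, ifo);
+ //
+ // Because that level is reserved, compaction into the last used level does
+ // not zero out sequence numbers, and this test guards against that turning
+ // into an endless kBottommostFiles compaction loop.)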
+ Options options = CurrentOptions(); + options.env = env_; + options.compaction_style = kCompactionStyleLevel; + options.allow_ingest_behind = true; + options.comparator = BytewiseComparator(); + DestroyAndReopen(options); + + WriteOptions write_opts; + ASSERT_OK(db_->Put(write_opts, "infinite", "compaction loop")); + ASSERT_OK(db_->Put(write_opts, "infinite", "loop")); + + ASSERT_OK(Flush()); + MoveFilesToLevel(1); + ASSERT_OK(db_->Put(write_opts, "bumpseqnum", "")); + ASSERT_OK(Flush()); + auto snapshot = db_->GetSnapshot(); + // Bump up oldest_snapshot_seqnum_ in VersionStorageInfo. + db_->ReleaseSnapshot(snapshot); + bool compacted = false; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "LevelCompactionPicker::PickCompaction:Return", [&](void* /* arg */) { + // There should not be a compaction. + compacted = true; + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + // Wait for compaction to be scheduled. + env_->SleepForMicroseconds(2000000); + ASSERT_FALSE(compacted); + // The following assert can be used to check for compaction loop: + // it used to wait forever before the fix. + // ASSERT_OK(dbfull()->TEST_WaitForCompact(true /* wait_unscheduled */)); +} + +#endif // !defined(ROCKSDB_LITE) + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { +#if !defined(ROCKSDB_LITE) + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +#else + (void)argc; + (void)argv; + return 0; +#endif +} -- cgit v1.2.3