Diffstat (limited to 'src/rocksdb/db/db_range_del_test.cc')
-rw-r--r--  src/rocksdb/db/db_range_del_test.cc  1531
1 file changed, 1531 insertions(+), 0 deletions(-)
diff --git a/src/rocksdb/db/db_range_del_test.cc b/src/rocksdb/db/db_range_del_test.cc
new file mode 100644
index 00000000..ebe9366d
--- /dev/null
+++ b/src/rocksdb/db/db_range_del_test.cc
@@ -0,0 +1,1531 @@
+// Copyright (c) 2016-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "util/testutil.h"
+#include "utilities/merge_operators.h"
+
+namespace rocksdb {
+
+class DBRangeDelTest : public DBTestBase {
+ public:
+  DBRangeDelTest() : DBTestBase("/db_range_del_test") {}
+
+  std::string GetNumericStr(int key) {
+    uint64_t uint64_key = static_cast<uint64_t>(key);
+    std::string str;
+    str.resize(8);
+    memcpy(&str[0], static_cast<void*>(&uint64_key), 8);
+    return str;
+  }
+};
+
+// PlainTableFactory and NumTableFilesAtLevel() are not supported in
+// ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
+TEST_F(DBRangeDelTest, NonBlockBasedTableNotSupported) {
+  // TODO: figure out why MmapReads trips the iterator pinning assertion in
+  // RangeDelAggregator. Ideally it would be supported; otherwise it should at
+  // least be explicitly unsupported.
+  for (auto config : {kPlainTableAllBytesPrefix, /* kWalDirAndMmapReads */}) {
+    option_config_ = config;
+    DestroyAndReopen(CurrentOptions());
+    ASSERT_TRUE(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                                 "dr1", "dr1")
+                    .IsNotSupported());
+  }
+}
+
+TEST_F(DBRangeDelTest, FlushOutputHasOnlyRangeTombstones) {
+  do {
+    DestroyAndReopen(CurrentOptions());
+    ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                               "dr1", "dr2"));
+    ASSERT_OK(db_->Flush(FlushOptions()));
+    ASSERT_EQ(1, NumTableFilesAtLevel(0));
+  } while (ChangeOptions(kRangeDelSkipConfigs));
+}
+
+TEST_F(DBRangeDelTest, CompactionOutputHasOnlyRangeTombstone) {
+  do {
+    Options opts = CurrentOptions();
+    opts.disable_auto_compactions = true;
+    opts.statistics = CreateDBStatistics();
+    DestroyAndReopen(opts);
+
+    // snapshot protects range tombstone from dropping due to becoming obsolete.
+    const Snapshot* snapshot = db_->GetSnapshot();
+    db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z");
+    db_->Flush(FlushOptions());
+
+    ASSERT_EQ(1, NumTableFilesAtLevel(0));
+    ASSERT_EQ(0, NumTableFilesAtLevel(1));
+    dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+                                true /* disallow_trivial_move */);
+    ASSERT_EQ(0, NumTableFilesAtLevel(0));
+    ASSERT_EQ(1, NumTableFilesAtLevel(1));
+    ASSERT_EQ(0, TestGetTickerCount(opts, COMPACTION_RANGE_DEL_DROP_OBSOLETE));
+    db_->ReleaseSnapshot(snapshot);
+    // Skip cuckoo memtables, which do not support snapshots. Skip non-leveled
+    // compactions as the above assertions about the number of files in a level
+    // do not hold true.
+  } while (ChangeOptions(kRangeDelSkipConfigs | kSkipUniversalCompaction |
+                         kSkipFIFOCompaction));
+}
+
+TEST_F(DBRangeDelTest, CompactionOutputFilesExactlyFilled) {
+  // regression test for exactly filled compaction output files. Previously
+  // another file would be generated containing all range deletions, which
+  // could invalidate the non-overlapping file boundary invariant.
+  const int kNumPerFile = 4, kNumFiles = 2, kFileBytes = 9 << 10;
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.level0_file_num_compaction_trigger = kNumFiles;
+  options.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile));
+  options.num_levels = 2;
+  options.target_file_size_base = kFileBytes;
+  BlockBasedTableOptions table_options;
+  table_options.block_size_deviation = 50;  // each block holds two keys
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Reopen(options);
+
+  // snapshot protects range tombstone from dropping due to becoming obsolete.
+  const Snapshot* snapshot = db_->GetSnapshot();
+  db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0), Key(1));
+
+  Random rnd(301);
+  for (int i = 0; i < kNumFiles; ++i) {
+    std::vector<std::string> values;
+    // Write 12K (4 values, each 3K)
+    for (int j = 0; j < kNumPerFile; j++) {
+      values.push_back(RandomString(&rnd, 3 << 10));
+      ASSERT_OK(Put(Key(i * kNumPerFile + j), values[j]));
+      if (j == 0 && i > 0) {
+        dbfull()->TEST_WaitForFlushMemTable();
+      }
+    }
+  }
+  // put extra key to trigger final flush
+  ASSERT_OK(Put("", ""));
+  dbfull()->TEST_WaitForFlushMemTable();
+  ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0));
+  ASSERT_EQ(0, NumTableFilesAtLevel(1));
+
+  dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+                              true /* disallow_trivial_move */);
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  ASSERT_EQ(2, NumTableFilesAtLevel(1));
+  db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, MaxCompactionBytesCutsOutputFiles) {
+  // Ensures range deletion spanning multiple compaction output files that are
+  // cut by max_compaction_bytes will have non-overlapping key-ranges.
+  // https://github.com/facebook/rocksdb/issues/1778
+  const int kNumFiles = 2, kNumPerFile = 1 << 8, kBytesPerVal = 1 << 12;
+  Options opts = CurrentOptions();
+  opts.comparator = test::Uint64Comparator();
+  opts.disable_auto_compactions = true;
+  opts.level0_file_num_compaction_trigger = kNumFiles;
+  opts.max_compaction_bytes = kNumPerFile * kBytesPerVal;
+  opts.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile));
+  // Want max_compaction_bytes to trigger the end of compaction output file, not
+  // target_file_size_base, so make the latter much bigger
+  opts.target_file_size_base = 100 * opts.max_compaction_bytes;
+  Reopen(opts);
+
+  // snapshot protects range tombstone from dropping due to becoming obsolete.
+  const Snapshot* snapshot = db_->GetSnapshot();
+
+  // It spans the whole key-range, thus will be included in all output files
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                             GetNumericStr(0),
+                             GetNumericStr(kNumFiles * kNumPerFile - 1)));
+  Random rnd(301);
+  for (int i = 0; i < kNumFiles; ++i) {
+    std::vector<std::string> values;
+    // Write 1MB (256 values, each 4K)
+    for (int j = 0; j < kNumPerFile; j++) {
+      values.push_back(RandomString(&rnd, kBytesPerVal));
+      ASSERT_OK(Put(GetNumericStr(kNumPerFile * i + j), values[j]));
+    }
+    // extra entry to trigger SpecialSkipListFactory's flush
+    ASSERT_OK(Put(GetNumericStr(kNumPerFile), ""));
+    dbfull()->TEST_WaitForFlushMemTable();
+    ASSERT_EQ(i + 1, NumTableFilesAtLevel(0));
+  }
+
+  dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+                              true /* disallow_trivial_move */);
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  ASSERT_GE(NumTableFilesAtLevel(1), 2);
+
+  std::vector<std::vector<FileMetaData>> files;
+  dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files);
+
+  for (size_t i = 0; i < files[1].size() - 1; ++i) {
+    ASSERT_TRUE(InternalKeyComparator(opts.comparator)
+                    .Compare(files[1][i].largest, files[1][i + 1].smallest) <
+                0);
+  }
+  db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, SentinelsOmittedFromOutputFile) {
+  // Regression test for bug where sentinel range deletions (i.e., ones with
+  // sequence number of zero) were included in output files.
+  // snapshot protects range tombstone from dropping due to becoming obsolete.
+  const Snapshot* snapshot = db_->GetSnapshot();
+
+  // gaps between ranges creates sentinels in our internal representation
+  std::vector<std::pair<std::string, std::string>> range_dels = {
+      {"a", "b"}, {"c", "d"}, {"e", "f"}};
+  for (const auto& range_del : range_dels) {
+    ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                               range_del.first, range_del.second));
+  }
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+  std::vector<std::vector<FileMetaData>> files;
+  dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files);
+  ASSERT_GT(files[0][0].fd.smallest_seqno, 0);
+
+  db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, FlushRangeDelsSameStartKey) {
+  db_->Put(WriteOptions(), "b1", "val");
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "c"));
+  db_->Put(WriteOptions(), "b2", "val");
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "b"));
+  // first iteration verifies query correctness in memtable, second verifies
+  // query correctness for a single SST file
+  for (int i = 0; i < 2; ++i) {
+    if (i > 0) {
+      ASSERT_OK(db_->Flush(FlushOptions()));
+      ASSERT_EQ(1, NumTableFilesAtLevel(0));
+    }
+    std::string value;
+    ASSERT_TRUE(db_->Get(ReadOptions(), "b1", &value).IsNotFound());
+    ASSERT_OK(db_->Get(ReadOptions(), "b2", &value));
+  }
+}
+
+TEST_F(DBRangeDelTest, CompactRangeDelsSameStartKey) {
+  db_->Put(WriteOptions(), "unused", "val");  // prevents empty after compaction
+  db_->Put(WriteOptions(), "b1", "val");
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "c"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "b"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_EQ(3, NumTableFilesAtLevel(0));
+
+  for (int i = 0; i < 2; ++i) {
+    if (i > 0) {
+      dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+                                  true /* disallow_trivial_move */);
+      ASSERT_EQ(0, NumTableFilesAtLevel(0));
+      ASSERT_EQ(1, NumTableFilesAtLevel(1));
+    }
+    std::string value;
+    ASSERT_TRUE(db_->Get(ReadOptions(), "b1", &value).IsNotFound());
+  }
+}
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBRangeDelTest, FlushRemovesCoveredKeys) {
+  const int kNum = 300, kRangeBegin = 50, kRangeEnd = 250;
+  Options opts = CurrentOptions();
+  opts.comparator = test::Uint64Comparator();
+  Reopen(opts);
+
+  // Write a third before snapshot, a third between snapshot and tombstone, and
+  // a third after the tombstone. Keys older than snapshot or newer than the
+  // tombstone should be preserved.
+  const Snapshot* snapshot = nullptr;
+  for (int i = 0; i < kNum; ++i) {
+    if (i == kNum / 3) {
+      snapshot = db_->GetSnapshot();
+    } else if (i == 2 * kNum / 3) {
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                       GetNumericStr(kRangeBegin), GetNumericStr(kRangeEnd));
+    }
+    db_->Put(WriteOptions(), GetNumericStr(i), "val");
+  }
+  db_->Flush(FlushOptions());
+
+  for (int i = 0; i < kNum; ++i) {
+    ReadOptions read_opts;
+    read_opts.ignore_range_deletions = true;
+    std::string value;
+    if (i < kRangeBegin || i > kRangeEnd || i < kNum / 3 || i >= 2 * kNum / 3) {
+      ASSERT_OK(db_->Get(read_opts, GetNumericStr(i), &value));
+    } else {
+      ASSERT_TRUE(db_->Get(read_opts, GetNumericStr(i), &value).IsNotFound());
+    }
+  }
+  db_->ReleaseSnapshot(snapshot);
+}
+
+// NumTableFilesAtLevel() is not supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
+TEST_F(DBRangeDelTest, CompactionRemovesCoveredKeys) {
+  const int kNumPerFile = 100, kNumFiles = 4;
+  Options opts = CurrentOptions();
+  opts.comparator = test::Uint64Comparator();
+  opts.disable_auto_compactions = true;
+  opts.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile));
+  opts.num_levels = 2;
+  opts.statistics = CreateDBStatistics();
+  Reopen(opts);
+
+  for (int i = 0; i < kNumFiles; ++i) {
+    if (i > 0) {
+      // range tombstone covers first half of the previous file
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                       GetNumericStr((i - 1) * kNumPerFile),
+                       GetNumericStr((i - 1) * kNumPerFile + kNumPerFile / 2));
+    }
+    // Make sure a given key appears in each file so compaction won't be able to
+    // use trivial move, which would happen if the ranges were non-overlapping.
+    // Also, we need an extra element since flush is only triggered when the
+    // number of keys is one greater than SpecialSkipListFactory's limit.
+    // We choose a key outside the key-range used by the test to avoid conflict.
+    db_->Put(WriteOptions(), GetNumericStr(kNumPerFile * kNumFiles), "val");
+
+    for (int j = 0; j < kNumPerFile; ++j) {
+      db_->Put(WriteOptions(), GetNumericStr(i * kNumPerFile + j), "val");
+    }
+    dbfull()->TEST_WaitForFlushMemTable();
+    ASSERT_EQ(i + 1, NumTableFilesAtLevel(0));
+  }
+  db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  ASSERT_GT(NumTableFilesAtLevel(1), 0);
+  ASSERT_EQ((kNumFiles - 1) * kNumPerFile / 2,
+            TestGetTickerCount(opts, COMPACTION_KEY_DROP_RANGE_DEL));
+
+  for (int i = 0; i < kNumFiles; ++i) {
+    for (int j = 0; j < kNumPerFile; ++j) {
+      ReadOptions read_opts;
+      read_opts.ignore_range_deletions = true;
+      std::string value;
+      if (i == kNumFiles - 1 || j >= kNumPerFile / 2) {
+        ASSERT_OK(
+            db_->Get(read_opts, GetNumericStr(i * kNumPerFile + j), &value));
+      } else {
+        ASSERT_TRUE(
+            db_->Get(read_opts, GetNumericStr(i * kNumPerFile + j), &value)
+                .IsNotFound());
+      }
+    }
+  }
+}
+
+TEST_F(DBRangeDelTest, ValidLevelSubcompactionBoundaries) {
+  const int kNumPerFile = 100, kNumFiles = 4, kFileBytes = 100 << 10;
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.level0_file_num_compaction_trigger = kNumFiles;
+  options.max_bytes_for_level_base = 2 * kFileBytes;
+  options.max_subcompactions = 4;
+  options.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile));
+  options.num_levels = 3;
+  options.target_file_size_base = kFileBytes;
+  options.target_file_size_multiplier = 1;
+  Reopen(options);
+
+  Random rnd(301);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < kNumFiles; ++j) {
+      if (i > 0) {
+        // delete [95,105) in two files, [295,305) in next two
+        int mid = (j + (1 - j % 2)) * kNumPerFile;
+        db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                         Key(mid - 5), Key(mid + 5));
+      }
+      std::vector<std::string> values;
+      // Write 100KB (100 values, each 1K)
+      for (int k = 0; k < kNumPerFile; k++) {
+        values.push_back(RandomString(&rnd, 990));
+        ASSERT_OK(Put(Key(j * kNumPerFile + k), values[k]));
+      }
+      // put extra key to trigger flush
+      ASSERT_OK(Put("", ""));
+      dbfull()->TEST_WaitForFlushMemTable();
+      if (j < kNumFiles - 1) {
+        // background compaction may happen early for kNumFiles'th file
+        ASSERT_EQ(NumTableFilesAtLevel(0), j + 1);
+      }
+      if (j == options.level0_file_num_compaction_trigger - 1) {
+        // When i == 1, compaction will output some files to L1, at which point
+        // L1 is not bottommost so range deletions cannot be compacted away. The
+        // new L1 files must be generated with non-overlapping key ranges even
+        // though multiple subcompactions see the same ranges deleted, else an
+        // assertion will fail.
+        //
+        // Only enable auto-compactions when we're ready; otherwise, the
+        // oversized L0 (relative to base_level) causes the compaction to run
+        // earlier.
+        ASSERT_OK(db_->EnableAutoCompaction({db_->DefaultColumnFamily()}));
+        dbfull()->TEST_WaitForCompact();
+        ASSERT_OK(db_->SetOptions(db_->DefaultColumnFamily(),
+                                  {{"disable_auto_compactions", "true"}}));
+        ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+        ASSERT_GT(NumTableFilesAtLevel(1), 0);
+        ASSERT_GT(NumTableFilesAtLevel(2), 0);
+      }
+    }
+  }
+}
+
+TEST_F(DBRangeDelTest, ValidUniversalSubcompactionBoundaries) {
+  const int kNumPerFile = 100, kFilesPerLevel = 4, kNumLevels = 4;
+  Options options = CurrentOptions();
+  options.compaction_options_universal.min_merge_width = kFilesPerLevel;
+  options.compaction_options_universal.max_merge_width = kFilesPerLevel;
+  options.compaction_options_universal.size_ratio = 10;
+  options.compaction_style = kCompactionStyleUniversal;
+  options.level0_file_num_compaction_trigger = kFilesPerLevel;
+  options.max_subcompactions = 4;
+  options.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile));
+  options.num_levels = kNumLevels;
+  options.target_file_size_base = kNumPerFile << 10;
+  options.target_file_size_multiplier = 1;
+  Reopen(options);
+
+  Random rnd(301);
+  for (int i = 0; i < kNumLevels - 1; ++i) {
+    for (int j = 0; j < kFilesPerLevel; ++j) {
+      if (i == kNumLevels - 2) {
+        // insert range deletions [95,105) in two files, [295,305) in next two
+        // to prepare L1 for later manual compaction.
+        int mid = (j + (1 - j % 2)) * kNumPerFile;
+        db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                         Key(mid - 5), Key(mid + 5));
+      }
+      std::vector<std::string> values;
+      // Write 100KB (100 values, each 1K)
+      for (int k = 0; k < kNumPerFile; k++) {
+        values.push_back(RandomString(&rnd, 990));
+        ASSERT_OK(Put(Key(j * kNumPerFile + k), values[k]));
+      }
+      // put extra key to trigger flush
+      ASSERT_OK(Put("", ""));
+      dbfull()->TEST_WaitForFlushMemTable();
+      if (j < kFilesPerLevel - 1) {
+        // background compaction may happen early for kFilesPerLevel'th file
+        ASSERT_EQ(NumTableFilesAtLevel(0), j + 1);
+      }
+    }
+    dbfull()->TEST_WaitForCompact();
+    ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+    ASSERT_GT(NumTableFilesAtLevel(kNumLevels - 1 - i), kFilesPerLevel - 1);
+  }
+  // Now L1-L3 are full, when we compact L1->L2 we should see (1) subcompactions
+  // happen since input level > 0; (2) range deletions are not dropped since
+  // output level is not bottommost. If no file boundary assertion fails, that
+  // probably means universal compaction + subcompaction + range deletion are
+  // compatible.
+  ASSERT_OK(dbfull()->RunManualCompaction(
+      reinterpret_cast<ColumnFamilyHandleImpl*>(db_->DefaultColumnFamily())
+          ->cfd(),
+      1 /* input_level */, 2 /* output_level */, 0 /* output_path_id */,
+      0 /* max_subcompactions */, nullptr /* begin */, nullptr /* end */,
+      true /* exclusive */, true /* disallow_trivial_move */));
+}
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBRangeDelTest, CompactionRemovesCoveredMergeOperands) {
+  const int kNumPerFile = 3, kNumFiles = 3;
+  Options opts = CurrentOptions();
+  opts.disable_auto_compactions = true;
+  opts.memtable_factory.reset(new SpecialSkipListFactory(2 * kNumPerFile));
+  opts.merge_operator = MergeOperators::CreateUInt64AddOperator();
+  opts.num_levels = 2;
+  Reopen(opts);
+
+  // Iterates kNumFiles * kNumPerFile + 1 times since flushing the last file
+  // requires an extra entry.
+  for (int i = 0; i <= kNumFiles * kNumPerFile; ++i) {
+    if (i % kNumPerFile == 0 && i / kNumPerFile == kNumFiles - 1) {
+      // Delete merge operands from all but the last file
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "key",
+                       "key_");
+    }
+    std::string val;
+    PutFixed64(&val, i);
+    db_->Merge(WriteOptions(), "key", val);
+    // we need to prevent trivial move using Puts so compaction will actually
+    // process the merge operands.
+    db_->Put(WriteOptions(), "prevent_trivial_move", "");
+    if (i > 0 && i % kNumPerFile == 0) {
+      dbfull()->TEST_WaitForFlushMemTable();
+    }
+  }
+
+  ReadOptions read_opts;
+  read_opts.ignore_range_deletions = true;
+  std::string expected, actual;
+  ASSERT_OK(db_->Get(read_opts, "key", &actual));
+  PutFixed64(&expected, 45);  // 1+2+...+9
+  ASSERT_EQ(expected, actual);
+
+  db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+
+  expected.clear();
+  ASSERT_OK(db_->Get(read_opts, "key", &actual));
+  uint64_t tmp;
+  Slice tmp2(actual);
+  GetFixed64(&tmp2, &tmp);
+  PutFixed64(&expected, 30);  // 6+7+8+9 (earlier operands covered by tombstone)
+  ASSERT_EQ(expected, actual);
+}
+
+TEST_F(DBRangeDelTest, PutDeleteRangeMergeFlush) {
+  // Test the sequence of operations: (1) Put, (2) DeleteRange, (3) Merge, (4)
+  // Flush. The `CompactionIterator` previously had a bug where we forgot to
+  // check for covering range tombstones when processing the (1) Put, causing
+  // it to reappear after the flush.
+  Options opts = CurrentOptions();
+  opts.merge_operator = MergeOperators::CreateUInt64AddOperator();
+  Reopen(opts);
+
+  std::string val;
+  PutFixed64(&val, 1);
+  ASSERT_OK(db_->Put(WriteOptions(), "key", val));
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                             "key", "key_"));
+  ASSERT_OK(db_->Merge(WriteOptions(), "key", val));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+
+  ReadOptions read_opts;
+  std::string expected, actual;
+  ASSERT_OK(db_->Get(read_opts, "key", &actual));
+  PutFixed64(&expected, 1);
+  ASSERT_EQ(expected, actual);
+}
+
+// NumTableFilesAtLevel() is not supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
+TEST_F(DBRangeDelTest, ObsoleteTombstoneCleanup) {
+  // During compaction to bottommost level, verify range tombstones older than
+  // the oldest snapshot are removed, while others are preserved.
+  Options opts = CurrentOptions();
+  opts.disable_auto_compactions = true;
+  opts.num_levels = 2;
+  opts.statistics = CreateDBStatistics();
+  Reopen(opts);
+
+  db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr1",
+                   "dr10");  // obsolete after compaction
+  db_->Put(WriteOptions(), "key", "val");
+  db_->Flush(FlushOptions());
+  const Snapshot* snapshot = db_->GetSnapshot();
+  db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr2",
+                   "dr20");  // protected by snapshot
+  db_->Put(WriteOptions(), "key", "val");
+  db_->Flush(FlushOptions());
+
+  ASSERT_EQ(2, NumTableFilesAtLevel(0));
+  ASSERT_EQ(0, NumTableFilesAtLevel(1));
+  db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  ASSERT_EQ(1, NumTableFilesAtLevel(1));
+  ASSERT_EQ(1, TestGetTickerCount(opts, COMPACTION_RANGE_DEL_DROP_OBSOLETE));
+
+  db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, TableEvictedDuringScan) {
+  // The RangeDelAggregator holds pointers into range deletion blocks created by
+  // table readers. This test ensures the aggregator can still access those
+  // blocks even if it outlives the table readers that created them.
+  //
+  // DBIter always keeps readers open for L0 files. So, in order to test
+  // aggregator outliving reader, we need to have deletions in L1 files, which
+  // are opened/closed on-demand during the scan. This is accomplished by
+  // setting kNumRanges > level0_stop_writes_trigger, which prevents deletions
+  // from all lingering in L0 (there is at most one range deletion per L0 file).
+  //
+  // The first L1 file will contain a range deletion since its begin key is 0.
+  // SeekToFirst() references that table's reader and adds its range tombstone
+  // to the aggregator. Upon advancing beyond that table's key-range via Next(),
+  // the table reader will be unreferenced by the iterator. Since we manually
+  // call Evict() on all readers before the full scan, this unreference causes
+  // the reader's refcount to drop to zero and thus be destroyed.
+  //
+  // When it is destroyed, we do not remove its range deletions from the
+  // aggregator. So, subsequent calls to Next() must be able to use these
+  // deletions to decide whether a key is covered. This will work as long as
+  // the aggregator properly references the range deletion block.
+  const int kNum = 25, kRangeBegin = 0, kRangeEnd = 7, kNumRanges = 5;
+  Options opts = CurrentOptions();
+  opts.comparator = test::Uint64Comparator();
+  opts.level0_file_num_compaction_trigger = 4;
+  opts.level0_stop_writes_trigger = 4;
+  opts.memtable_factory.reset(new SpecialSkipListFactory(1));
+  opts.num_levels = 2;
+  BlockBasedTableOptions bbto;
+  bbto.cache_index_and_filter_blocks = true;
+  bbto.block_cache = NewLRUCache(8 << 20);
+  opts.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  Reopen(opts);
+
+  // Hold a snapshot so range deletions can't become obsolete during compaction
+  // to bottommost level (i.e., L1).
+  const Snapshot* snapshot = db_->GetSnapshot();
+  for (int i = 0; i < kNum; ++i) {
+    db_->Put(WriteOptions(), GetNumericStr(i), "val");
+    if (i > 0) {
+      dbfull()->TEST_WaitForFlushMemTable();
+    }
+    if (i >= kNum / 2 && i < kNum / 2 + kNumRanges) {
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                       GetNumericStr(kRangeBegin), GetNumericStr(kRangeEnd));
+    }
+  }
+  // Must be > 1 so the first L1 file can be closed before scan finishes
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_GT(NumTableFilesAtLevel(1), 1);
+  std::vector<uint64_t> file_numbers = ListTableFiles(env_, dbname_);
+
+  ReadOptions read_opts;
+  auto* iter = db_->NewIterator(read_opts);
+  int expected = kRangeEnd;
+  iter->SeekToFirst();
+  for (auto file_number : file_numbers) {
+    // This puts table caches in the state of being externally referenced only
+    // so they are destroyed immediately upon iterator unreferencing.
+    TableCache::Evict(dbfull()->TEST_table_cache(), file_number);
+  }
+  for (; iter->Valid(); iter->Next()) {
+    ASSERT_EQ(GetNumericStr(expected), iter->key());
+    ++expected;
+    // Keep clearing block cache's LRU so range deletion block can be freed as
+    // soon as its refcount drops to zero.
+    bbto.block_cache->EraseUnRefEntries();
+  }
+  ASSERT_EQ(kNum, expected);
+  delete iter;
+  db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, GetCoveredKeyFromMutableMemtable) {
+  do {
+    DestroyAndReopen(CurrentOptions());
+    db_->Put(WriteOptions(), "key", "val");
+    ASSERT_OK(
+        db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+
+    ReadOptions read_opts;
+    std::string value;
+    ASSERT_TRUE(db_->Get(read_opts, "key", &value).IsNotFound());
+  } while (ChangeOptions(kRangeDelSkipConfigs));
+}
+
+TEST_F(DBRangeDelTest, GetCoveredKeyFromImmutableMemtable) {
+  do {
+    Options opts = CurrentOptions();
+    opts.max_write_buffer_number = 3;
+    opts.min_write_buffer_number_to_merge = 2;
+    // SpecialSkipListFactory lets us specify maximum number of elements the
+    // memtable can hold. It switches the active memtable to immutable (flush is
+    // prevented by the above options) upon inserting an element that would
+    // overflow the memtable.
+    opts.memtable_factory.reset(new SpecialSkipListFactory(1));
+    DestroyAndReopen(opts);
+
+    db_->Put(WriteOptions(), "key", "val");
+    ASSERT_OK(
+        db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+    db_->Put(WriteOptions(), "blah", "val");
+
+    ReadOptions read_opts;
+    std::string value;
+    ASSERT_TRUE(db_->Get(read_opts, "key", &value).IsNotFound());
+  } while (ChangeOptions(kRangeDelSkipConfigs));
+}
+
+TEST_F(DBRangeDelTest, GetCoveredKeyFromSst) {
+  do {
+    DestroyAndReopen(CurrentOptions());
+    db_->Put(WriteOptions(), "key", "val");
+    // snapshot prevents key from being deleted during flush
+    const Snapshot* snapshot = db_->GetSnapshot();
+    ASSERT_OK(
+        db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+    ASSERT_OK(db_->Flush(FlushOptions()));
+
+    ReadOptions read_opts;
+    std::string value;
+    ASSERT_TRUE(db_->Get(read_opts, "key", &value).IsNotFound());
+    db_->ReleaseSnapshot(snapshot);
+  } while (ChangeOptions(kRangeDelSkipConfigs));
+}
+
+TEST_F(DBRangeDelTest, GetCoveredMergeOperandFromMemtable) {
+  const int kNumMergeOps = 10;
+  Options opts = CurrentOptions();
+  opts.merge_operator = MergeOperators::CreateUInt64AddOperator();
+  Reopen(opts);
+
+  for (int i = 0; i < kNumMergeOps; ++i) {
+    std::string val;
+    PutFixed64(&val, i);
+    db_->Merge(WriteOptions(), "key", val);
+    if (i == kNumMergeOps / 2) {
+      // deletes [0, 5]
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "key",
+                       "key_");
+    }
+  }
+
+  ReadOptions read_opts;
+  std::string expected, actual;
+  ASSERT_OK(db_->Get(read_opts, "key", &actual));
+  PutFixed64(&expected, 30);  // 6+7+8+9
+  ASSERT_EQ(expected, actual);
+
+  expected.clear();
+  read_opts.ignore_range_deletions = true;
+  ASSERT_OK(db_->Get(read_opts, "key", &actual));
+  PutFixed64(&expected, 45);  // 0+1+2+...+9
+  ASSERT_EQ(expected, actual);
+}
+
+TEST_F(DBRangeDelTest, GetIgnoresRangeDeletions) {
+  Options opts = CurrentOptions();
+  opts.max_write_buffer_number = 4;
+  opts.min_write_buffer_number_to_merge = 3;
+  opts.memtable_factory.reset(new SpecialSkipListFactory(1));
+  Reopen(opts);
+
+  db_->Put(WriteOptions(), "sst_key", "val");
+  // snapshot prevents key from being deleted during flush
+  const Snapshot* snapshot = db_->GetSnapshot();
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  db_->Put(WriteOptions(), "imm_key", "val");
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+  db_->Put(WriteOptions(), "mem_key", "val");
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+
+  ReadOptions read_opts;
+  read_opts.ignore_range_deletions = true;
+  for (std::string key : {"sst_key", "imm_key", "mem_key"}) {
+    std::string value;
+    ASSERT_OK(db_->Get(read_opts, key, &value));
+  }
+  db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, IteratorRemovesCoveredKeys) {
+  const int kNum = 200, kRangeBegin = 50, kRangeEnd = 150, kNumPerFile = 25;
+  Options opts = CurrentOptions();
+  opts.comparator = test::Uint64Comparator();
+  opts.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile));
+  Reopen(opts);
+
+  // Write half of the keys before the tombstone and half after the tombstone.
+  // Only covered keys (i.e., within the range and older than the tombstone)
+  // should be deleted.
+  for (int i = 0; i < kNum; ++i) {
+    if (i == kNum / 2) {
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                       GetNumericStr(kRangeBegin), GetNumericStr(kRangeEnd));
+    }
+    db_->Put(WriteOptions(), GetNumericStr(i), "val");
+  }
+  ReadOptions read_opts;
+  auto* iter = db_->NewIterator(read_opts);
+
+  int expected = 0;
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    ASSERT_EQ(GetNumericStr(expected), iter->key());
+    if (expected == kRangeBegin - 1) {
+      expected = kNum / 2;
+    } else {
+      ++expected;
+    }
+  }
+  ASSERT_EQ(kNum, expected);
+  delete iter;
+}
+
+TEST_F(DBRangeDelTest, IteratorOverUserSnapshot) {
+  const int kNum = 200, kRangeBegin = 50, kRangeEnd = 150, kNumPerFile = 25;
+  Options opts = CurrentOptions();
+  opts.comparator = test::Uint64Comparator();
+  opts.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile));
+  Reopen(opts);
+
+  const Snapshot* snapshot = nullptr;
+  // Put a snapshot before the range tombstone, verify an iterator using that
+  // snapshot sees all inserted keys.
+  for (int i = 0; i < kNum; ++i) {
+    if (i == kNum / 2) {
+      snapshot = db_->GetSnapshot();
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                       GetNumericStr(kRangeBegin), GetNumericStr(kRangeEnd));
+    }
+    db_->Put(WriteOptions(), GetNumericStr(i), "val");
+  }
+  ReadOptions read_opts;
+  read_opts.snapshot = snapshot;
+  auto* iter = db_->NewIterator(read_opts);
+
+  int expected = 0;
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    ASSERT_EQ(GetNumericStr(expected), iter->key());
+    ++expected;
+  }
+  ASSERT_EQ(kNum / 2, expected);
+  delete iter;
+  db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, IteratorIgnoresRangeDeletions) {
+  Options opts = CurrentOptions();
+  opts.max_write_buffer_number = 4;
+  opts.min_write_buffer_number_to_merge = 3;
+  opts.memtable_factory.reset(new SpecialSkipListFactory(1));
+  Reopen(opts);
+
+  db_->Put(WriteOptions(), "sst_key", "val");
+  // snapshot prevents key from being deleted during flush
+  const Snapshot* snapshot = db_->GetSnapshot();
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  db_->Put(WriteOptions(), "imm_key", "val");
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+  db_->Put(WriteOptions(), "mem_key", "val");
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+
+  ReadOptions read_opts;
+  read_opts.ignore_range_deletions = true;
+  auto* iter = db_->NewIterator(read_opts);
+  int i = 0;
+  std::string expected[] = {"imm_key", "mem_key", "sst_key"};
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next(), ++i) {
+    std::string key;
+    ASSERT_EQ(expected[i], iter->key());
+  }
+  ASSERT_EQ(3, i);
+  delete iter;
+  db_->ReleaseSnapshot(snapshot);
+}
+
+#ifndef ROCKSDB_UBSAN_RUN
+TEST_F(DBRangeDelTest, TailingIteratorRangeTombstoneUnsupported) {
+  db_->Put(WriteOptions(), "key", "val");
+  // snapshot prevents key from being deleted during flush
+  const Snapshot* snapshot = db_->GetSnapshot();
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+
+  // iterations check unsupported in memtable, l0, and then l1
+  for (int i = 0; i < 3; ++i) {
+    ReadOptions read_opts;
+    read_opts.tailing = true;
+    auto* iter = db_->NewIterator(read_opts);
+    if (i == 2) {
+      // For L1+, iterators over files are created on-demand, so need seek
+      iter->SeekToFirst();
+    }
+    ASSERT_TRUE(iter->status().IsNotSupported());
+    delete iter;
+    if (i == 0) {
+      ASSERT_OK(db_->Flush(FlushOptions()));
+    } else if (i == 1) {
+      MoveFilesToLevel(1);
+    }
+  }
+  db_->ReleaseSnapshot(snapshot);
+}
+
+#endif  // !ROCKSDB_UBSAN_RUN
+
+TEST_F(DBRangeDelTest, SubcompactionHasEmptyDedicatedRangeDelFile) {
+  const int kNumFiles = 2, kNumKeysPerFile = 4;
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+  options.level0_file_num_compaction_trigger = kNumFiles;
+  options.max_subcompactions = 2;
+  options.num_levels = 2;
+  options.target_file_size_base = 4096;
+  Reopen(options);
+
+  // need a L1 file for subcompaction to be triggered
+  ASSERT_OK(
+      db_->Put(WriteOptions(), db_->DefaultColumnFamily(), Key(0), "val"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(1);
+
+  // put enough keys to fill up the first subcompaction, and later range-delete
+  // them so that the first subcompaction outputs no key-values. In that case
+  // it'll consider making an SST file dedicated to range deletions.
+  for (int i = 0; i < kNumKeysPerFile; ++i) {
+    ASSERT_OK(db_->Put(WriteOptions(), db_->DefaultColumnFamily(), Key(i),
+                       std::string(1024, 'a')));
+  }
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+                             Key(kNumKeysPerFile)));
+
+  // the above range tombstone can be dropped, so that one alone won't cause a
+  // dedicated file to be opened. We can make one protected by snapshot that
+  // must be considered. Make its range outside the first subcompaction's range
+  // to exercise the tricky part of the code.
+  const Snapshot* snapshot = db_->GetSnapshot();
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                             Key(kNumKeysPerFile + 1),
+                             Key(kNumKeysPerFile + 2)));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+
+  ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0));
+  ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+  db_->EnableAutoCompaction({db_->DefaultColumnFamily()});
+  dbfull()->TEST_WaitForCompact();
+  db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, MemtableBloomFilter) {
+  // regression test for #2743. the range delete tombstones in memtable should
+  // be added even when Get() skips searching due to its prefix bloom filter
+  const int kMemtableSize = 1 << 20;              // 1MB
+  const int kMemtablePrefixFilterSize = 1 << 13;  // 8KB
+  const int kNumKeys = 1000;
+  const int kPrefixLen = 8;
+  Options options = CurrentOptions();
+  options.memtable_prefix_bloom_size_ratio =
+      static_cast<double>(kMemtablePrefixFilterSize) / kMemtableSize;
+  options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(kPrefixLen));
+  options.write_buffer_size = kMemtableSize;
+  Reopen(options);
+
+  for (int i = 0; i < kNumKeys; ++i) {
+    ASSERT_OK(Put(Key(i), "val"));
+  }
+  Flush();
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+                             Key(kNumKeys)));
+  for (int i = 0; i < kNumKeys; ++i) {
+    std::string value;
+    ASSERT_TRUE(db_->Get(ReadOptions(), Key(i), &value).IsNotFound());
+  }
+}
+
+TEST_F(DBRangeDelTest, CompactionTreatsSplitInputLevelDeletionAtomically) {
+  // This test originally verified that compaction treated files containing a
+  // split range deletion in the input level as an atomic unit. I.e.,
+  // compacting any input-level file(s) containing a portion of the range
+  // deletion causes all other input-level files containing portions of that
+  // same range deletion to be included in the compaction. Range deletion
+  // tombstones are now truncated to sstable boundaries which removed the need
+  // for that behavior (which could lead to excessively large
+  // compactions).
+  const int kNumFilesPerLevel = 4, kValueBytes = 4 << 10;
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.level0_file_num_compaction_trigger = kNumFilesPerLevel;
+  options.memtable_factory.reset(
+      new SpecialSkipListFactory(2 /* num_entries_flush */));
+  options.target_file_size_base = kValueBytes;
+  // i == 0: CompactFiles
+  // i == 1: CompactRange
+  // i == 2: automatic compaction
+  for (int i = 0; i < 3; ++i) {
+    DestroyAndReopen(options);
+
+    ASSERT_OK(Put(Key(0), ""));
+    ASSERT_OK(db_->Flush(FlushOptions()));
+    MoveFilesToLevel(2);
+    ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+    // snapshot protects range tombstone from dropping due to becoming obsolete.
+    const Snapshot* snapshot = db_->GetSnapshot();
+    db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+                     Key(2 * kNumFilesPerLevel));
+
+    Random rnd(301);
+    std::string value = RandomString(&rnd, kValueBytes);
+    for (int j = 0; j < kNumFilesPerLevel; ++j) {
+      // give files overlapping key-ranges to prevent trivial move
+      ASSERT_OK(Put(Key(j), value));
+      ASSERT_OK(Put(Key(2 * kNumFilesPerLevel - 1 - j), value));
+      if (j > 0) {
+        dbfull()->TEST_WaitForFlushMemTable();
+        ASSERT_EQ(j, NumTableFilesAtLevel(0));
+      }
+    }
+    // put extra key to trigger final flush
+    ASSERT_OK(Put("", ""));
+    dbfull()->TEST_WaitForFlushMemTable();
+    dbfull()->TEST_WaitForCompact();
+    ASSERT_EQ(0, NumTableFilesAtLevel(0));
+    ASSERT_EQ(kNumFilesPerLevel, NumTableFilesAtLevel(1));
+
+    ColumnFamilyMetaData meta;
+    db_->GetColumnFamilyMetaData(&meta);
+    if (i == 0) {
+      ASSERT_OK(db_->CompactFiles(
+          CompactionOptions(), {meta.levels[1].files[0].name}, 2 /* level */));
+      ASSERT_EQ(0, NumTableFilesAtLevel(1));
+    } else if (i == 1) {
+      auto begin_str = Key(0), end_str = Key(1);
+      Slice begin = begin_str, end = end_str;
+      ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &begin, &end));
+      ASSERT_EQ(3, NumTableFilesAtLevel(1));
+    } else if (i == 2) {
+      ASSERT_OK(db_->SetOptions(db_->DefaultColumnFamily(),
+                                {{"max_bytes_for_level_base", "10000"}}));
+      dbfull()->TEST_WaitForCompact();
+      ASSERT_EQ(1, NumTableFilesAtLevel(1));
+    }
+    ASSERT_GT(NumTableFilesAtLevel(2), 0);
+
+    db_->ReleaseSnapshot(snapshot);
+  }
+}
+
+TEST_F(DBRangeDelTest, RangeTombstoneEndKeyAsSstableUpperBound) {
+  // Test the handling of the range-tombstone end-key as the
+  // upper-bound for an sstable.
+
+  const int kNumFilesPerLevel = 2, kValueBytes = 4 << 10;
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.level0_file_num_compaction_trigger = kNumFilesPerLevel;
+  options.memtable_factory.reset(
+      new SpecialSkipListFactory(2 /* num_entries_flush */));
+  options.target_file_size_base = kValueBytes;
+  options.disable_auto_compactions = true;
+
+  DestroyAndReopen(options);
+
+  // Create an initial sstable at L2:
+  //   [key000000#1,1, key000000#1,1]
+  ASSERT_OK(Put(Key(0), ""));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(2);
+  ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+  // A snapshot protects the range tombstone from dropping due to
+  // becoming obsolete.
+  const Snapshot* snapshot = db_->GetSnapshot();
+  db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                   Key(0), Key(2 * kNumFilesPerLevel));
+
+  // Create 2 additional sstables in L0. Note that the first sstable
+  // contains the range tombstone.
+  //   [key000000#3,1, key000004#72057594037927935,15]
+  //   [key000001#5,1, key000002#6,1]
+  Random rnd(301);
+  std::string value = RandomString(&rnd, kValueBytes);
+  for (int j = 0; j < kNumFilesPerLevel; ++j) {
+    // Give files overlapping key-ranges to prevent a trivial move when we
+    // compact from L0 to L1.
+    ASSERT_OK(Put(Key(j), value));
+    ASSERT_OK(Put(Key(2 * kNumFilesPerLevel - 1 - j), value));
+    ASSERT_OK(db_->Flush(FlushOptions()));
+    ASSERT_EQ(j + 1, NumTableFilesAtLevel(0));
+  }
+  // Compact the 2 L0 sstables to L1, resulting in the following LSM. There
+  // are 2 sstables generated in L1 due to the target_file_size_base setting.
+  //   L1:
+  //     [key000000#3,1, key000002#72057594037927935,15]
+  //     [key000002#6,1, key000004#72057594037927935,15]
+  //   L2:
+  //     [key000000#1,1, key000000#1,1]
+  MoveFilesToLevel(1);
+  ASSERT_EQ(2, NumTableFilesAtLevel(1));
+
+  {
+    // Compact the second sstable in L1:
+    //   L1:
+    //     [key000000#3,1, key000002#72057594037927935,15]
+    //   L2:
+    //     [key000000#1,1, key000000#1,1]
+    //     [key000002#6,1, key000004#72057594037927935,15]
+    //
+    // At the same time, verify the compaction does not cause the key at the
+    // endpoint (key000002#6,1) to disappear.
+    ASSERT_EQ(value, Get(Key(2)));
+    auto begin_str = Key(3);
+    const rocksdb::Slice begin = begin_str;
+    dbfull()->TEST_CompactRange(1, &begin, nullptr);
+    ASSERT_EQ(1, NumTableFilesAtLevel(1));
+    ASSERT_EQ(2, NumTableFilesAtLevel(2));
+    ASSERT_EQ(value, Get(Key(2)));
+  }
+
+  {
+    // Compact the first sstable in L1. This should be copacetic, but
+    // was previously resulting in overlapping sstables in L2 due to
+    // mishandling of the range tombstone end-key when used as the
+    // largest key for an sstable. The resulting LSM structure should
+    // be:
+    //
+    //   L2:
+    //     [key000000#1,1, key000001#72057594037927935,15]
+    //     [key000001#5,1, key000002#72057594037927935,15]
+    //     [key000002#6,1, key000004#72057594037927935,15]
+    auto begin_str = Key(0);
+    const rocksdb::Slice begin = begin_str;
+    dbfull()->TEST_CompactRange(1, &begin, &begin);
+    ASSERT_EQ(0, NumTableFilesAtLevel(1));
+    ASSERT_EQ(3, NumTableFilesAtLevel(2));
+  }
+
+  db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, UnorderedTombstones) {
+  // Regression test for #2752. Range delete tombstones between
+  // different snapshot stripes are not stored in order, so the first
+  // tombstone of each snapshot stripe should be checked as a smallest
+  // candidate.
+  Options options = CurrentOptions();
+  DestroyAndReopen(options);
+
+  auto cf = db_->DefaultColumnFamily();
+
+  ASSERT_OK(db_->Put(WriteOptions(), cf, "a", "a"));
+  ASSERT_OK(db_->Flush(FlushOptions(), cf));
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr));
+  ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), cf, "b", "c"));
+  // Hold a snapshot to separate these two delete ranges.
+  auto snapshot = db_->GetSnapshot();
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), cf, "a", "b"));
+  ASSERT_OK(db_->Flush(FlushOptions(), cf));
+  db_->ReleaseSnapshot(snapshot);
+
+  std::vector<std::vector<FileMetaData>> files;
+  dbfull()->TEST_GetFilesMetaData(cf, &files);
+  ASSERT_EQ(1, files[0].size());
+  ASSERT_EQ("a", files[0][0].smallest.user_key());
+  ASSERT_EQ("c", files[0][0].largest.user_key());
+
+  std::string v;
+  auto s = db_->Get(ReadOptions(), "a", &v);
+  ASSERT_TRUE(s.IsNotFound());
+}
+
+class MockMergeOperator : public MergeOperator {
+  // Mock non-associative operator. Non-associativity is expressed by lack of
+  // implementation for any `PartialMerge*` functions.
+ public:
+  bool FullMergeV2(const MergeOperationInput& merge_in,
+                   MergeOperationOutput* merge_out) const override {
+    assert(merge_out != nullptr);
+    merge_out->new_value = merge_in.operand_list.back().ToString();
+    return true;
+  }
+
+  const char* Name() const override { return "MockMergeOperator"; }
+};
+
+TEST_F(DBRangeDelTest, KeyAtOverlappingEndpointReappears) {
+  // This test uses a non-associative merge operator since that is a convenient
+  // way to get compaction to write out files with overlapping user-keys at the
+  // endpoints. Note, however, overlapping endpoints can also occur with other
+  // value types (Put, etc.), assuming the right snapshots are present.
+  const int kFileBytes = 1 << 20;
+  const int kValueBytes = 1 << 10;
+  const int kNumFiles = 4;
+
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+  options.merge_operator.reset(new MockMergeOperator());
+  options.target_file_size_base = kFileBytes;
+  Reopen(options);
+
+  // Push dummy data to L3 so that our actual test files on L0-L2
+  // will not be considered "bottommost" level, otherwise compaction
+  // may prevent us from creating overlapping user keys
+  // as on the bottommost layer MergeHelper
+  ASSERT_OK(db_->Merge(WriteOptions(), "key", "dummy"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(3);
+
+  Random rnd(301);
+  const Snapshot* snapshot = nullptr;
+  for (int i = 0; i < kNumFiles; ++i) {
+    for (int j = 0; j < kFileBytes / kValueBytes; ++j) {
+      auto value = RandomString(&rnd, kValueBytes);
+      ASSERT_OK(db_->Merge(WriteOptions(), "key", value));
+    }
+    if (i == kNumFiles - 1) {
+      // Take snapshot to prevent covered merge operands from being dropped by
+      // compaction.
+      snapshot = db_->GetSnapshot();
+      // The DeleteRange is the last write so all merge operands are covered.
+      ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                                 "key", "key_"));
+    }
+    ASSERT_OK(db_->Flush(FlushOptions()));
+  }
+  ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0));
+  std::string value;
+  ASSERT_TRUE(db_->Get(ReadOptions(), "key", &value).IsNotFound());
+
+  dbfull()->TEST_CompactRange(0 /* level */, nullptr /* begin */,
+                              nullptr /* end */, nullptr /* column_family */,
+                              true /* disallow_trivial_move */);
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  // Now we have multiple files at L1 all containing a single user key, thus
+  // guaranteeing overlap in the file endpoints.
+  ASSERT_GT(NumTableFilesAtLevel(1), 1);
+
+  // Verify no merge operands reappeared after the compaction.
+  ASSERT_TRUE(db_->Get(ReadOptions(), "key", &value).IsNotFound());
+
+  // Compact and verify again. It's worthwhile because now the files have
+  // tighter endpoints, so we can verify that doesn't mess anything up.
+  dbfull()->TEST_CompactRange(1 /* level */, nullptr /* begin */,
+                              nullptr /* end */, nullptr /* column_family */,
+                              true /* disallow_trivial_move */);
+  ASSERT_GT(NumTableFilesAtLevel(2), 1);
+  ASSERT_TRUE(db_->Get(ReadOptions(), "key", &value).IsNotFound());
+
+  db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, UntruncatedTombstoneDoesNotDeleteNewerKey) {
+  // Verify a key newer than a range tombstone cannot be deleted by being
+  // compacted to the bottom level (and thus having its seqnum zeroed) before
+  // the range tombstone. This used to happen when range tombstones were
+  // untruncated on reads such that they extended past their file boundaries.
+  //
+  // Test summary:
+  //
+  // - L1 is bottommost.
+  // - A couple snapshots are strategically taken to prevent seqnums from being
+  //   zeroed, range tombstone from being dropped, merge operands from being
+  //   dropped, and merge operands from being combined.
+  // - Left half of files in L1 all have same user key, ensuring their file
+  //   boundaries overlap. In the past this would cause range tombstones to be
+  //   untruncated.
+  // - Right half of L1 files all have different keys, ensuring no overlap.
+  // - A range tombstone spans all L1 keys, so it is stored in every L1 file.
+  // - Keys in the right side of the key-range are overwritten. These are
+  //   compacted down to L1 after releasing snapshots such that their seqnums
+  //   will be zeroed.
+  // - A full range scan is performed. If the tombstone in the left L1 files
+  //   were untruncated, it would now cover keys newer than it (but with zeroed
+  //   seqnums) in the right L1 files.
+  const int kFileBytes = 1 << 20;
+  const int kValueBytes = 1 << 10;
+  const int kNumFiles = 4;
+  const int kMaxKey = kNumFiles * kFileBytes / kValueBytes;
+  const int kKeysOverwritten = 10;
+
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+  options.merge_operator.reset(new MockMergeOperator());
+  options.num_levels = 2;
+  options.target_file_size_base = kFileBytes;
+  Reopen(options);
+
+  Random rnd(301);
+  // - snapshots[0] prevents merge operands from being combined during
+  //   compaction.
+  // - snapshots[1] prevents merge operands from being dropped due to the
+  //   covering range tombstone.
+  const Snapshot* snapshots[] = {nullptr, nullptr};
+  for (int i = 0; i < kNumFiles; ++i) {
+    for (int j = 0; j < kFileBytes / kValueBytes; ++j) {
+      auto value = RandomString(&rnd, kValueBytes);
+      std::string key;
+      if (i < kNumFiles / 2) {
+        key = Key(0);
+      } else {
+        key = Key(1 + i * kFileBytes / kValueBytes + j);
+      }
+      ASSERT_OK(db_->Merge(WriteOptions(), key, value));
+    }
+    if (i == 0) {
+      snapshots[0] = db_->GetSnapshot();
+    }
+    if (i == kNumFiles - 1) {
+      snapshots[1] = db_->GetSnapshot();
+      // The DeleteRange is the last write so all merge operands are covered.
+      ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                                 Key(0), Key(kMaxKey + 1)));
+    }
+    ASSERT_OK(db_->Flush(FlushOptions()));
+  }
+  ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0));
+
+  auto get_key_count = [this]() -> int {
+    auto* iter = db_->NewIterator(ReadOptions());
+    iter->SeekToFirst();
+    int keys_found = 0;
+    for (; iter->Valid(); iter->Next()) {
+      ++keys_found;
+    }
+    delete iter;
+    return keys_found;
+  };
+
+  // All keys should be covered
+  ASSERT_EQ(0, get_key_count());
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr /* begin_key */,
+                              nullptr /* end_key */));
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  // Roughly the left half of L1 files should have overlapping boundary keys,
+  // while the right half should not.
+  ASSERT_GE(NumTableFilesAtLevel(1), kNumFiles);
+
+  // Now overwrite a few keys that are in L1 files that definitely don't have
+  // overlapping boundary keys.
+  for (int i = kMaxKey; i > kMaxKey - kKeysOverwritten; --i) {
+    auto value = RandomString(&rnd, kValueBytes);
+    ASSERT_OK(db_->Merge(WriteOptions(), Key(i), value));
+  }
+  ASSERT_OK(db_->Flush(FlushOptions()));
+
+  // The overwritten keys are in L0 now, so clearly aren't covered by the range
+  // tombstone in L1.
+  ASSERT_EQ(kKeysOverwritten, get_key_count());
+
+  // Release snapshots so seqnums can be zeroed when L0->L1 happens.
+  db_->ReleaseSnapshot(snapshots[0]);
+  db_->ReleaseSnapshot(snapshots[1]);
+
+  auto begin_key_storage = Key(kMaxKey - kKeysOverwritten + 1);
+  auto end_key_storage = Key(kMaxKey);
+  Slice begin_key(begin_key_storage);
+  Slice end_key(end_key_storage);
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &begin_key, &end_key));
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  ASSERT_GE(NumTableFilesAtLevel(1), kNumFiles);
+
+  ASSERT_EQ(kKeysOverwritten, get_key_count());
+}
+
+TEST_F(DBRangeDelTest, DeletedMergeOperandReappearsIterPrev) {
+  // Exposes a bug where we were using
+  // `RangeDelPositioningMode::kBackwardTraversal` while scanning merge operands
+  // in the forward direction. Confusingly, this case happened during
+  // `DBIter::Prev`. It could cause assertion failure, or reappearing keys.
+  const int kFileBytes = 1 << 20;
+  const int kValueBytes = 1 << 10;
+  // Need multiple keys so we can get results when calling `Prev()` after
+  // `SeekToLast()`.
+  const int kNumKeys = 3;
+  const int kNumFiles = 4;
+
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+  options.merge_operator.reset(new MockMergeOperator());
+  options.target_file_size_base = kFileBytes;
+  Reopen(options);
+
+  Random rnd(301);
+  const Snapshot* snapshot = nullptr;
+  for (int i = 0; i < kNumFiles; ++i) {
+    for (int j = 0; j < kFileBytes / kValueBytes; ++j) {
+      auto value = RandomString(&rnd, kValueBytes);
+      ASSERT_OK(db_->Merge(WriteOptions(), Key(j % kNumKeys), value));
+      if (i == 0 && j == kNumKeys) {
+        // Take snapshot to prevent covered merge operands from being dropped or
+        // merged by compaction.
+        snapshot = db_->GetSnapshot();
+        // Do a DeleteRange near the beginning so only the oldest merge operand
+        // for each key is covered. This ensures the sequence of events:
+        //
+        // - `DBIter::Prev()` is called
+        // - After several same versions of the same user key are encountered,
+        //   it decides to seek using `DBIter::FindValueForCurrentKeyUsingSeek`.
+        // - Binary searches to the newest version of the key, which is in the
+        //   leftmost file containing the user key.
+        // - Scans forwards to collect all merge operands. Eventually reaches
+        //   the rightmost file containing the oldest merge operand, which
+        //   should be covered by the `DeleteRange`. If `RangeDelAggregator`
+        //   were not properly using `kForwardTraversal` here, that operand
+        //   would reappear.
+        ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                                   Key(0), Key(kNumKeys + 1)));
+      }
+    }
+    ASSERT_OK(db_->Flush(FlushOptions()));
+  }
+  ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0));
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr /* begin_key */,
+                              nullptr /* end_key */));
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  ASSERT_GT(NumTableFilesAtLevel(1), 1);
+
+  auto* iter = db_->NewIterator(ReadOptions());
+  iter->SeekToLast();
+  int keys_found = 0;
+  for (; iter->Valid(); iter->Prev()) {
+    ++keys_found;
+  }
+  delete iter;
+  ASSERT_EQ(kNumKeys, keys_found);
+
+  db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, SnapshotPreventsDroppedKeys) {
+  const int kFileBytes = 1 << 20;
+
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+  options.target_file_size_base = kFileBytes;
+  Reopen(options);
+
+  ASSERT_OK(Put(Key(0), "a"));
+  const Snapshot* snapshot = db_->GetSnapshot();
+
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+                             Key(10)));
+
+  db_->Flush(FlushOptions());
+
+  ReadOptions read_opts;
+  read_opts.snapshot = snapshot;
+  auto* iter = db_->NewIterator(read_opts);
+
+  iter->SeekToFirst();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(Key(0), iter->key());
+
+  iter->Next();
+  ASSERT_FALSE(iter->Valid());
+
+  delete iter;
+  db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, RangeTombstoneWrittenToMinimalSsts) {
+  // Adapted from
+  // https://github.com/cockroachdb/cockroach/blob/de8b3ea603dd1592d9dc26443c2cc92c356fbc2f/pkg/storage/engine/rocksdb_test.go#L1267-L1398.
+  // Regression test for issue where range tombstone was written to more files
+  // than necessary when it began exactly at the begin key in the next
+  // compaction output file.
+  const int kFileBytes = 1 << 20;
+  const int kValueBytes = 4 << 10;
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+  // Have a bit of slack in the size limits but we enforce them more strictly
+  // when manually flushing/compacting.
+  options.max_compaction_bytes = 2 * kFileBytes;
+  options.target_file_size_base = 2 * kFileBytes;
+  options.write_buffer_size = 2 * kFileBytes;
+  Reopen(options);
+
+  Random rnd(301);
+  for (char first_char : {'a', 'b', 'c'}) {
+    for (int i = 0; i < kFileBytes / kValueBytes; ++i) {
+      std::string key(1, first_char);
+      key.append(Key(i));
+      std::string value = RandomString(&rnd, kValueBytes);
+      ASSERT_OK(Put(key, value));
+    }
+    db_->Flush(FlushOptions());
+    MoveFilesToLevel(2);
+  }
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  ASSERT_EQ(3, NumTableFilesAtLevel(2));
+
+  // Populate the memtable lightly while spanning the whole key-space. The
+  // setting of `max_compaction_bytes` will cause the L0->L1 to output multiple
+  // files to prevent a large L1->L2 compaction later.
+  ASSERT_OK(Put("a", "val"));
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                             "c" + Key(1), "d"));
+  // Our compaction output file cutting logic currently only considers point
+  // keys. So, in order for the range tombstone to have a chance at landing at
+  // the start of a new file, we need a point key at the range tombstone's
+  // start.
+  // TODO(ajkr): remove this `Put` after file cutting accounts for range
+  // tombstones (#3977).
+ ASSERT_OK(Put("c" + Key(1), "value")); + db_->Flush(FlushOptions()); + + // Ensure manual L0->L1 compaction cuts the outputs before the range tombstone + // and the range tombstone is only placed in the second SST. + std::string begin_key_storage("c" + Key(1)); + Slice begin_key(begin_key_storage); + std::string end_key_storage("d"); + Slice end_key(end_key_storage); + dbfull()->TEST_CompactRange(0 /* level */, &begin_key /* begin */, + &end_key /* end */, nullptr /* column_family */, + true /* disallow_trivial_move */); + ASSERT_EQ(2, NumTableFilesAtLevel(1)); + + std::vector<LiveFileMetaData> all_metadata; + std::vector<LiveFileMetaData> l1_metadata; + db_->GetLiveFilesMetaData(&all_metadata); + for (const auto& metadata : all_metadata) { + if (metadata.level == 1) { + l1_metadata.push_back(metadata); + } + } + std::sort(l1_metadata.begin(), l1_metadata.end(), + [&](const LiveFileMetaData& a, const LiveFileMetaData& b) { + return options.comparator->Compare(a.smallestkey, b.smallestkey) < + 0; + }); + ASSERT_EQ("a", l1_metadata[0].smallestkey); + ASSERT_EQ("a", l1_metadata[0].largestkey); + ASSERT_EQ("c" + Key(1), l1_metadata[1].smallestkey); + ASSERT_EQ("d", l1_metadata[1].largestkey); + + TablePropertiesCollection all_table_props; + ASSERT_OK(db_->GetPropertiesOfAllTables(&all_table_props)); + int64_t num_range_deletions = 0; + for (const auto& name_and_table_props : all_table_props) { + const auto& name = name_and_table_props.first; + const auto& table_props = name_and_table_props.second; + // The range tombstone should only be output to the second L1 SST. + if (name.size() >= l1_metadata[1].name.size() && + name.substr(name.size() - l1_metadata[1].name.size()).compare(l1_metadata[1].name) == 0) { + ASSERT_EQ(1, table_props->num_range_deletions); + ++num_range_deletions; + } else { + ASSERT_EQ(0, table_props->num_range_deletions); + } + } + ASSERT_EQ(1, num_range_deletions); +} + +#endif // ROCKSDB_LITE + +} // namespace rocksdb + +int main(int argc, char** argv) { + rocksdb::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} |